summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIrina Yatsenko <irina.yatsenko@mongodb.com>2023-03-03 17:49:27 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-03-03 23:00:36 +0000
commit23a26b5dad45d12187b61a1e5e49c60c5be26632 (patch)
tree15c189d6cb1488a4fbc4b0d676a8d0792cf0e10d
parent4f07b59413fd6053ad8033934c4312fecbcbd5d9 (diff)
downloadmongo-23a26b5dad45d12187b61a1e5e49c60c5be26632.tar.gz
SERVER-74297 Initial implementation of $percentile accumulator
-rw-r--r--jstests/aggregation/accumulators/percentiles_approx.js89
-rw-r--r--jstests/aggregation/accumulators/percentiles_syntax.js106
-rw-r--r--src/mongo/db/pipeline/SConscript2
-rw-r--r--src/mongo/db/pipeline/accumulator_percentile.cpp174
-rw-r--r--src/mongo/db/pipeline/accumulator_percentile.h82
-rw-r--r--src/mongo/db/pipeline/percentile_algo.h77
-rw-r--r--src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp90
7 files changed, 620 insertions, 0 deletions
diff --git a/jstests/aggregation/accumulators/percentiles_approx.js b/jstests/aggregation/accumulators/percentiles_approx.js
new file mode 100644
index 00000000000..227b15ca1df
--- /dev/null
+++ b/jstests/aggregation/accumulators/percentiles_approx.js
@@ -0,0 +1,89 @@
+/**
+ * Tests for the approximate percentile accumulator semantics.
+ * @tags: [
+ * featureFlagApproxPercentiles,
+ * # sharded collections aren't supported yet
+ * assumes_unsharded_collection,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/aggregation/extras/utils.js");
+
+const coll = db[jsTestName()];
+
+/**
+ * Tests for correctness without grouping. Each group gets its own accumulator so we can validate
+ * the basic $percentile functionality using a single group.
+ */
+function testWithSingleGroup({docs, percentileSpec, expectedResult, msg}) {
+ coll.drop();
+ coll.insertMany(docs);
+ const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+ // For $percentile the result should be ordered to match the spec, so assert exact equality.
+ assert.eq(expectedResult, res[0].p, msg + ` result: ${tojson(res)}`);
+}
+
+testWithSingleGroup({
+ docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
+ percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}},
+ expectedResult: [1],
+ msg: "Non-numeric data should be ignored"
+});
+
+testWithSingleGroup({
+ docs: [{x: "non-numeric"}, {no_x: 0}],
+ percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}},
+ expectedResult: null,
+ msg: "Percentile of completely non-numeric data"
+});
+
+testWithSingleGroup({
+ docs: [{x: "non-numeric"}, {no_x: 0}],
+ percentileSpec: {$percentile: {p: [0.5, 0.9], input: "$x", algorithm: "approximate"}},
+ expectedResult: null,
+ msg: "Multiple percentiles of completely non-numeric data"
+});
+
+testWithSingleGroup({
+ docs: [{x: 10}, {x: 5}, {x: 27}],
+ percentileSpec: {$percentile: {p: [0], input: "$x", algorithm: "approximate"}},
+ expectedResult: [5],
+ msg: "Minimun percentile"
+});
+
+testWithSingleGroup({
+ docs: [{x: 10}, {x: 5}, {x: 27}],
+ percentileSpec: {$percentile: {p: [1], input: "$x", algorithm: "approximate"}},
+ expectedResult: [27],
+ msg: "Maximum percentile"
+});
+
+testWithSingleGroup({
+ docs: [{x: 0}, {x: 1}, {x: 2}],
+ percentileSpec: {$percentile: {p: [0.5, 0.9, 0.1], input: "$x", algorithm: "approximate"}},
+ expectedResult: [1, 2, 0],
+ msg: "Multiple percentiles"
+});
+
+function testWithMultipleGroups({docs, percentileSpec, expectedResult, msg}) {
+ coll.drop();
+ coll.insertMany(docs);
+ const res =
+ coll.aggregate([{$group: {_id: "$k", p: percentileSpec}}, {$sort: {_id: 1}}]).toArray();
+
+ // For $percentile the result should be ordered to match the spec, so assert exact equality.
+ for (let i = 0; i < res.length; i++) {
+ assert.eq(expectedResult[i], res[i].p, msg + ` result: ${tojson(res)}`);
+ }
+}
+
+testWithMultipleGroups({
+ docs: [{k: 0, x: 0}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}],
+ percentileSpec: {$percentile: {p: [0.9], input: "$x", algorithm: "approximate"}},
+ expectedResult: [/* k:0 */[1], /* k:1 */[2], /* k:2 */ null],
+ msg: "Multiple groups"
+});
+})();
diff --git a/jstests/aggregation/accumulators/percentiles_syntax.js b/jstests/aggregation/accumulators/percentiles_syntax.js
new file mode 100644
index 00000000000..ed5e1d12e1f
--- /dev/null
+++ b/jstests/aggregation/accumulators/percentiles_syntax.js
@@ -0,0 +1,106 @@
+/**
+ * Tests for the $percentile accumulator syntax.
+ * @tags: [
+ * featureFlagApproxPercentiles,
+ * # sharded collections aren't supported yet
+ * assumes_unsharded_collection,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/aggregation/extras/utils.js");
+
+const coll = db[jsTestName()];
+coll.drop();
+// These tests don't validate the computed $percentile but we need a result to be produced in
+// order to check its format.
+coll.insert({x: 42});
+
+/**
+ * Tests to check that invalid $percentile specifications are rejected.
+ */
+function assertInvalidSyntax(percentileSpec, msg) {
+ assert.commandFailed(
+ coll.runCommand("aggregate",
+ {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}),
+ msg);
+}
+
+assertInvalidSyntax({$percentile: 0.5}, "Should fail if $percentile is not an object");
+
+assertInvalidSyntax({$percentile: {input: "$x", algorithm: "approximate"}},
+ "Should fail if $percentile is missing 'p' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], algorithm: "approximate"}},
+ "Should fail if $percentile is missing 'input' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], input: "$x"}},
+ "Should fail if $percentile is missing 'algorithm' field");
+
+assertInvalidSyntax({$percentile: {p: [0.5], input: "$x", algorithm: "approximate", extras: 42}},
+ "Should fail if $percentile contains an unexpected field");
+
+assertInvalidSyntax({$percentile: {p: 0.5, input: "$x", algorithm: "approximate"}},
+ "Should fail if 'p' field in $percentile isn't array");
+
+assertInvalidSyntax({$percentile: {p: [], input: "$x", algorithm: "approximate"}},
+ "Should fail if 'p' field in $percentile is an empty array");
+
+assertInvalidSyntax(
+ {$percentile: {p: [0.5, "foo"], input: "$x", algorithm: "approximate"}},
+ "Should fail if 'p' field in $percentile is an array with a non-numeric element");
+
+assertInvalidSyntax(
+ {$percentile: {p: [0.5, 10], input: "$x", algorithm: "approximate"}},
+ "Should fail if 'p' field in $percentile is an array with any value outside of [0, 1] range");
+
+assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}},
+ "Should fail if 'algorithm' field isn't a string");
+
+assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}},
+ "Should fail if 'algorithm' isn't one of _predefined_ strings");
+
+/**
+ * Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested
+ * elsewhere and would cover all of the cases below, we are providing them here nonetheless for
+ * completeness.
+ */
+function assertValidSyntax(percentileSpec, msg) {
+ assert.commandWorked(
+ coll.runCommand("aggregate",
+ {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}),
+ msg);
+}
+
+assertValidSyntax(
+ {$percentile: {p: [0.0, 0.0001, 0.5, 0.995, 1.0], input: "$x", algorithm: "approximate"}},
+ "Should be able to specify an array of percentiles");
+
+assertValidSyntax(
+ {$percentile: {p: [0.5, 0.9], input: {$divide: ["$x", 2]}, algorithm: "approximate"}},
+ "Should be able to specify 'input' as an expression");
+
+assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}},
+ "Non-numeric inputs should be gracefully ignored");
+
+/**
+ * Test that the "arrayness" of the result matches the "arrayness" of the specification.
+ */
+function assertArrayness(percentileSpec, msg) {
+ const pInSpec = percentileSpec.$percentile.p;
+
+ const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+ assert.eq(Array.isArray(pInSpec), Array.isArray(res[0].p), msg + ` result: ${tojson(res)}`);
+ if (Array.isArray(pInSpec)) {
+ assert.eq(pInSpec.length, res[0].p.length, msg + ` result: ${tojson(res)}`);
+ }
+}
+
+assertArrayness({$percentile: {p: [0.2, 0.9], input: "$x", algorithm: "approximate"}},
+ "Computed percentiles should be in an array of length 2");
+
+assertArrayness({$percentile: {p: [0.2], input: "$x", algorithm: "approximate"}},
+ "Computed percentiles should be in an array of length 1");
+})();
diff --git a/src/mongo/db/pipeline/SConscript b/src/mongo/db/pipeline/SConscript
index 918878e266e..cf4b9d8e8d9 100644
--- a/src/mongo/db/pipeline/SConscript
+++ b/src/mongo/db/pipeline/SConscript
@@ -116,11 +116,13 @@ env.Library(
'accumulator_merge_objects.cpp',
'accumulator_min_max.cpp',
'accumulator_multi.cpp',
+ 'accumulator_percentile.cpp',
'accumulator_push.cpp',
'accumulator_rank.cpp',
'accumulator_std_dev.cpp',
'accumulator_sum.cpp',
'map_reduce_options.idl',
+ 'percentile_algo_sort_and_rank.cpp',
'window_function/window_bounds.cpp',
'window_function/window_function_covariance.cpp',
'window_function/window_function_count.cpp',
diff --git a/src/mongo/db/pipeline/accumulator_percentile.cpp b/src/mongo/db/pipeline/accumulator_percentile.cpp
new file mode 100644
index 00000000000..659c08d526e
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_percentile.cpp
@@ -0,0 +1,174 @@
+/**
+ * Copyright (C) 2023-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/db/pipeline/accumulator_percentile.h"
+
+namespace mongo {
+
+using boost::intrusive_ptr;
+
+REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile,
+ AccumulatorPercentile::parseArgs,
+ feature_flags::gFeatureFlagApproxPercentiles);
+
+namespace {
+// Checks that 'pv' is an array of valid percentile specifications.
+std::vector<double> validatePercentileArg(const Value& pv) {
+ const auto msg = "'p' must be an array of numeric values from [0.0, 1.0] range, but found "_sd;
+ uassert(7429700, str::stream() << msg << pv.toString(), pv.isArray());
+
+ std::vector<double> ps;
+ for (const Value& p : pv.getArray()) {
+ uassert(7429701, str::stream() << msg << pv.toString(), p.numeric());
+ auto pd = p.coerceToDouble();
+ uassert(7429702, str::stream() << msg << pv.toString(), pd >= 0 && pd <= 1);
+ ps.push_back(pd);
+ }
+ return ps;
+}
+} // namespace
+
+// TODO SERVER-74556: Move parsing of the args into IDL.
+AccumulationExpression AccumulatorPercentile::parseArgs(ExpressionContext* const expCtx,
+ BSONElement elem,
+ VariablesParseState vps) {
+ expCtx->sbeGroupCompatible = false;
+
+ uassert(7429703,
+ str::stream() << "specification must be an object; found " << elem,
+ elem.type() == BSONType::Object);
+ BSONObj obj = elem.embeddedObject();
+
+ boost::intrusive_ptr<Expression> input;
+ std::vector<double> ps;
+ std::string algo = "";
+ for (auto&& element : obj) {
+ auto fieldName = element.fieldNameStringData();
+ if (fieldName == kFieldNameInput) {
+ input = Expression::parseOperand(expCtx, element, vps);
+ } else if (fieldName == kFieldNameP) {
+ auto pv = Value(element);
+ ps = validatePercentileArg(pv);
+ } else if (fieldName == kFieldNameAlgo) {
+ if (element.type() == BSONType::String) {
+ algo = element.String();
+ }
+ // More types of percentiles will be supported in the future: PM-1883
+ uassert(7429704,
+ str::stream()
+ << "the valid specifications for 'algorithm' are: 'approximate'; found "
+ << element,
+ algo == "approximate");
+ } else {
+ uasserted(7429705,
+ str::stream() << "Unknown argument for 'percentile' operator: " << fieldName);
+ }
+ }
+ uassert(7429706, str::stream() << "Missing value for '" << kFieldNameP << "'", !ps.empty());
+ uassert(7429707, str::stream() << "Missing value for '" << kFieldNameInput << "'", input);
+ uassert(
+ 7429708, str::stream() << "Missing value for '" << kFieldNameAlgo << "'", !algo.empty());
+
+ auto factory = [expCtx, ps] {
+ // Temporary implementation! To be replaced based on the user's choice of algorithm.
+ auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
+
+ return AccumulatorPercentile::create(expCtx, ps, std::move(algo));
+ };
+
+ return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/,
+ std::move(input) /*argument*/,
+ std::move(factory),
+ "$percentile"_sd /*name*/};
+}
+
+void AccumulatorPercentile::processInternal(const Value& input, bool merging) {
+ if (merging) {
+ // Merging percentiles from different shards. For approximate percentiels 'input' is a
+ // document with a digest.
+ // TODO SERVER-74358: Support sharded collections
+ uasserted(7429709, "Sharded collections are not yet supported by $percentile");
+ return;
+ }
+
+ if (!input.numeric()) {
+ return;
+ }
+
+ _algo->incorporate(input.coerceToDouble());
+
+ // TODO SERVER-74558: Could/should we update the memory usage less often?
+ _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+Value AccumulatorPercentile::getValue(bool toBeMerged) {
+ if (toBeMerged) {
+ // Return a document, containing the whole digest, because they would need to be merged
+ // on mongos to compute the final result.
+ // TODO SERVER-74358: Support sharded collections
+ uasserted(7429710, "Sharded collections are not yet supported by $percentile");
+ return Value(Document{});
+ }
+
+ // Compute the percentiles for each requested one in the order listed. Computing a percentile
+ // can only fail if there have been no numeric inputs, then all percentiles would fail. Rather
+ // than returning an array of nulls in this case, we return a single null value.
+ std::vector<double> pctls;
+ for (double p : _percentiles) {
+ auto res = _algo->computePercentile(p);
+ if (!res) {
+ return Value(BSONNULL);
+ }
+ pctls.push_back(res.value());
+ }
+
+ return Value(std::vector<Value>(pctls.begin(), pctls.end()));
+}
+
+AccumulatorPercentile::AccumulatorPercentile(ExpressionContext* const expCtx,
+ const std::vector<double>& ps,
+ std::unique_ptr<PercentileAlgorithm> algo)
+ : AccumulatorState(expCtx), _percentiles(ps), _algo(std::move(algo)) {
+ _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+void AccumulatorPercentile::reset() {
+ // Temporary implementation! To be replaced based on the user's choice of algorithm.
+ auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
+
+ _memUsageBytes = sizeof(*this) + _algo->memUsageBytes();
+}
+
+intrusive_ptr<AccumulatorState> AccumulatorPercentile::create(
+ ExpressionContext* const expCtx,
+ const std::vector<double>& ps,
+ std::unique_ptr<PercentileAlgorithm> algo) {
+ return new AccumulatorPercentile(expCtx, ps, std::move(algo));
+}
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/accumulator_percentile.h b/src/mongo/db/pipeline/accumulator_percentile.h
new file mode 100644
index 00000000000..f25724911f0
--- /dev/null
+++ b/src/mongo/db/pipeline/accumulator_percentile.h
@@ -0,0 +1,82 @@
+/**
+ * Copyright (C) 2023-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/db/pipeline/accumulation_statement.h"
+#include "mongo/db/pipeline/percentile_algo.h"
+
+namespace mongo {
+/**
+ * Accumulator for computing $percentile.
+ */
+class AccumulatorPercentile : public AccumulatorState {
+public:
+ // {$percentile: {input: <Numeric>, p: <Array Expression>, algorithm: <String>}
+ static constexpr auto kFieldNameInput = "input"_sd;
+ static constexpr auto kFieldNameP = "p"_sd;
+ static constexpr auto kFieldNameAlgo = "algorithm"_sd;
+
+ static constexpr auto kName = "$percentile"_sd;
+ const char* getOpName() const final {
+ return kName.rawData();
+ }
+
+ /**
+ * Parsing and creating the accumulator. A separate accumulator object is created per group.
+ */
+ static AccumulationExpression parseArgs(ExpressionContext* expCtx,
+ BSONElement elem,
+ VariablesParseState vps);
+
+ static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx,
+ const std::vector<double>& ps,
+ std::unique_ptr<PercentileAlgorithm> algo);
+
+ AccumulatorPercentile(ExpressionContext* expCtx,
+ const std::vector<double>& ps,
+ std::unique_ptr<PercentileAlgorithm> algo);
+
+ /**
+ * Ingressing values and computing the requested percentiles.
+ */
+ void processInternal(const Value& input, bool merging) final;
+ Value getValue(bool toBeMerged) final;
+
+ /**
+ * Other infra for the accumulators.
+ */
+ void reset() final;
+
+private:
+ std::vector<double> _percentiles;
+ std::unique_ptr<PercentileAlgorithm> _algo;
+};
+
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/percentile_algo.h b/src/mongo/db/pipeline/percentile_algo.h
new file mode 100644
index 00000000000..ce37ec2db77
--- /dev/null
+++ b/src/mongo/db/pipeline/percentile_algo.h
@@ -0,0 +1,77 @@
+/**
+ * Copyright (C) 2023-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include <boost/optional/optional.hpp>
+#include <memory>
+
+namespace mongo {
+
+/**
+ * Eventually we'll be supporting multiple types of percentiles (discrete, continuous, approximate)
+ * and potentially multiple different algorithms for computing the approximate ones.
+ *
+ * The goal is to keep these algorithms MQL- and engine-agnostic with this interface.
+ */
+struct PercentileAlgorithm {
+ virtual ~PercentileAlgorithm() {}
+
+ virtual void incorporate(double input) = 0;
+ virtual void incorporate(const std::vector<double>& inputs) = 0;
+
+ /**
+ * 'p' must be from [0, 1] range.
+ *
+ * It is always valid to call 'computePercentile()', however, if no input has been incorporated
+ * yet, all implementations must return 'boost::none'. It is allowed to incorporate more inputs
+ * after calling 'computePercentile()' and call it again (naturally, the result might differ
+ * depending on the new data).
+ *
+ * Note 1: the implementations are free to either return "none" or throw if they require setting
+ * up for computing a specific percentile but a different one is requested here.
+ *
+ * Note 2: the implementations might have different tradeoffs regarding balancing performance of
+ * ingress vs computing the percentile, so this interface provides no perfomance guarantees.
+ * Refer to the documentation of the specific implementations for details.
+ */
+ virtual boost::optional<double> computePercentile(double p) = 0;
+
+ /*
+ * The owner might need a rough estimate of how much memory the algorithm is using.
+ */
+ virtual long memUsageBytes() = 0;
+
+ /**
+ * Factory methods for instantiating concrete algorithms.
+ */
+ static std::unique_ptr<PercentileAlgorithm> createDiscreteSortAndRank();
+};
+
+} // namespace mongo
diff --git a/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp
new file mode 100644
index 00000000000..921e8845f41
--- /dev/null
+++ b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp
@@ -0,0 +1,90 @@
+/**
+ * Copyright (C) 2023-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "mongo/db/pipeline/percentile_algo.h"
+
+namespace mongo {
+
+/**
+ * 'SortAndRank' algorithm for computing percentiles is accurate and doesn't require specifying a
+ * percentile in advance but is only suitable for small datasets. It accumulats all data sent to it
+ * and sorts it all when a percentile is requested. Requesting more percentiles after the first one
+ * without incorporating more data is fast as it doesn't need to sort again.
+ */
+class DiscreteSortAndRank : public PercentileAlgorithm {
+public:
+ DiscreteSortAndRank() = default; // no config required for this algorithm
+
+ void incorporate(double input) final {
+ // Take advantage of already sorted input -- avoid resorting it later.
+ if (!_shouldSort && !_accumulatedValues.empty() && input < _accumulatedValues.back()) {
+ _shouldSort = true;
+ }
+
+ _accumulatedValues.push_back(input);
+ }
+ void incorporate(const std::vector<double>& inputs) final {
+ _shouldSort = true;
+ _accumulatedValues.reserve(_accumulatedValues.size() + inputs.size());
+ _accumulatedValues.insert(_accumulatedValues.end(), inputs.begin(), inputs.end());
+ }
+
+ boost::optional<double> computePercentile(double p) final {
+ if (_accumulatedValues.empty()) {
+ return boost::none;
+ }
+
+ if (_shouldSort) {
+ std::sort(_accumulatedValues.begin(), _accumulatedValues.end());
+ _shouldSort = false;
+ }
+
+ const double n = _accumulatedValues.size();
+ const size_t rank = static_cast<size_t>(std::max<double>(0, std::ceil(p * n) - 1));
+ return _accumulatedValues[rank];
+ }
+
+ long memUsageBytes() final {
+ return _accumulatedValues.capacity() * sizeof(double);
+ }
+
+private:
+ std::vector<double> _accumulatedValues;
+ bool _shouldSort = true;
+};
+
+std::unique_ptr<PercentileAlgorithm> PercentileAlgorithm::createDiscreteSortAndRank() {
+ return std::make_unique<DiscreteSortAndRank>();
+}
+
+} // namespace mongo