diff options
author | Irina Yatsenko <irina.yatsenko@mongodb.com> | 2023-03-03 17:49:27 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-03-03 23:00:36 +0000 |
commit | 23a26b5dad45d12187b61a1e5e49c60c5be26632 (patch) | |
tree | 15c189d6cb1488a4fbc4b0d676a8d0792cf0e10d | |
parent | 4f07b59413fd6053ad8033934c4312fecbcbd5d9 (diff) | |
download | mongo-23a26b5dad45d12187b61a1e5e49c60c5be26632.tar.gz |
SERVER-74297 Initial implementation of $percentile accumulator
-rw-r--r-- | jstests/aggregation/accumulators/percentiles_approx.js | 89 | ||||
-rw-r--r-- | jstests/aggregation/accumulators/percentiles_syntax.js | 106 | ||||
-rw-r--r-- | src/mongo/db/pipeline/SConscript | 2 | ||||
-rw-r--r-- | src/mongo/db/pipeline/accumulator_percentile.cpp | 174 | ||||
-rw-r--r-- | src/mongo/db/pipeline/accumulator_percentile.h | 82 | ||||
-rw-r--r-- | src/mongo/db/pipeline/percentile_algo.h | 77 | ||||
-rw-r--r-- | src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp | 90 |
7 files changed, 620 insertions, 0 deletions
diff --git a/jstests/aggregation/accumulators/percentiles_approx.js b/jstests/aggregation/accumulators/percentiles_approx.js new file mode 100644 index 00000000000..227b15ca1df --- /dev/null +++ b/jstests/aggregation/accumulators/percentiles_approx.js @@ -0,0 +1,89 @@ +/** + * Tests for the approximate percentile accumulator semantics. + * @tags: [ + * featureFlagApproxPercentiles, + * # sharded collections aren't supported yet + * assumes_unsharded_collection, + * ] + */ +(function() { +"use strict"; + +load("jstests/aggregation/extras/utils.js"); + +const coll = db[jsTestName()]; + +/** + * Tests for correctness without grouping. Each group gets its own accumulator so we can validate + * the basic $percentile functionality using a single group. + */ +function testWithSingleGroup({docs, percentileSpec, expectedResult, msg}) { + coll.drop(); + coll.insertMany(docs); + const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray(); + + // For $percentile the result should be ordered to match the spec, so assert exact equality. + assert.eq(expectedResult, res[0].p, msg + ` result: ${tojson(res)}`); +} + +testWithSingleGroup({ + docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}], + percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}}, + expectedResult: [1], + msg: "Non-numeric data should be ignored" +}); + +testWithSingleGroup({ + docs: [{x: "non-numeric"}, {no_x: 0}], + percentileSpec: {$percentile: {p: [0.5], input: "$x", algorithm: "approximate"}}, + expectedResult: null, + msg: "Percentile of completely non-numeric data" +}); + +testWithSingleGroup({ + docs: [{x: "non-numeric"}, {no_x: 0}], + percentileSpec: {$percentile: {p: [0.5, 0.9], input: "$x", algorithm: "approximate"}}, + expectedResult: null, + msg: "Multiple percentiles of completely non-numeric data" +}); + +testWithSingleGroup({ + docs: [{x: 10}, {x: 5}, {x: 27}], + percentileSpec: {$percentile: {p: [0], input: "$x", algorithm: "approximate"}}, + expectedResult: [5], + msg: "Minimun percentile" +}); + +testWithSingleGroup({ + docs: [{x: 10}, {x: 5}, {x: 27}], + percentileSpec: {$percentile: {p: [1], input: "$x", algorithm: "approximate"}}, + expectedResult: [27], + msg: "Maximum percentile" +}); + +testWithSingleGroup({ + docs: [{x: 0}, {x: 1}, {x: 2}], + percentileSpec: {$percentile: {p: [0.5, 0.9, 0.1], input: "$x", algorithm: "approximate"}}, + expectedResult: [1, 2, 0], + msg: "Multiple percentiles" +}); + +function testWithMultipleGroups({docs, percentileSpec, expectedResult, msg}) { + coll.drop(); + coll.insertMany(docs); + const res = + coll.aggregate([{$group: {_id: "$k", p: percentileSpec}}, {$sort: {_id: 1}}]).toArray(); + + // For $percentile the result should be ordered to match the spec, so assert exact equality. + for (let i = 0; i < res.length; i++) { + assert.eq(expectedResult[i], res[i].p, msg + ` result: ${tojson(res)}`); + } +} + +testWithMultipleGroups({ + docs: [{k: 0, x: 0}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}], + percentileSpec: {$percentile: {p: [0.9], input: "$x", algorithm: "approximate"}}, + expectedResult: [/* k:0 */[1], /* k:1 */[2], /* k:2 */ null], + msg: "Multiple groups" +}); +})(); diff --git a/jstests/aggregation/accumulators/percentiles_syntax.js b/jstests/aggregation/accumulators/percentiles_syntax.js new file mode 100644 index 00000000000..ed5e1d12e1f --- /dev/null +++ b/jstests/aggregation/accumulators/percentiles_syntax.js @@ -0,0 +1,106 @@ +/** + * Tests for the $percentile accumulator syntax. + * @tags: [ + * featureFlagApproxPercentiles, + * # sharded collections aren't supported yet + * assumes_unsharded_collection, + * ] + */ +(function() { +"use strict"; + +load("jstests/aggregation/extras/utils.js"); + +const coll = db[jsTestName()]; +coll.drop(); +// These tests don't validate the computed $percentile but we need a result to be produced in +// order to check its format. +coll.insert({x: 42}); + +/** + * Tests to check that invalid $percentile specifications are rejected. + */ +function assertInvalidSyntax(percentileSpec, msg) { + assert.commandFailed( + coll.runCommand("aggregate", + {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}), + msg); +} + +assertInvalidSyntax({$percentile: 0.5}, "Should fail if $percentile is not an object"); + +assertInvalidSyntax({$percentile: {input: "$x", algorithm: "approximate"}}, + "Should fail if $percentile is missing 'p' field"); + +assertInvalidSyntax({$percentile: {p: [0.5], algorithm: "approximate"}}, + "Should fail if $percentile is missing 'input' field"); + +assertInvalidSyntax({$percentile: {p: [0.5], input: "$x"}}, + "Should fail if $percentile is missing 'algorithm' field"); + +assertInvalidSyntax({$percentile: {p: [0.5], input: "$x", algorithm: "approximate", extras: 42}}, + "Should fail if $percentile contains an unexpected field"); + +assertInvalidSyntax({$percentile: {p: 0.5, input: "$x", algorithm: "approximate"}}, + "Should fail if 'p' field in $percentile isn't array"); + +assertInvalidSyntax({$percentile: {p: [], input: "$x", algorithm: "approximate"}}, + "Should fail if 'p' field in $percentile is an empty array"); + +assertInvalidSyntax( + {$percentile: {p: [0.5, "foo"], input: "$x", algorithm: "approximate"}}, + "Should fail if 'p' field in $percentile is an array with a non-numeric element"); + +assertInvalidSyntax( + {$percentile: {p: [0.5, 10], input: "$x", algorithm: "approximate"}}, + "Should fail if 'p' field in $percentile is an array with any value outside of [0, 1] range"); + +assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}}, + "Should fail if 'algorithm' field isn't a string"); + +assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}}, + "Should fail if 'algorithm' isn't one of _predefined_ strings"); + +/** + * Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested + * elsewhere and would cover all of the cases below, we are providing them here nonetheless for + * completeness. + */ +function assertValidSyntax(percentileSpec, msg) { + assert.commandWorked( + coll.runCommand("aggregate", + {pipeline: [{$group: {_id: null, p: percentileSpec}}], cursor: {}}), + msg); +} + +assertValidSyntax( + {$percentile: {p: [0.0, 0.0001, 0.5, 0.995, 1.0], input: "$x", algorithm: "approximate"}}, + "Should be able to specify an array of percentiles"); + +assertValidSyntax( + {$percentile: {p: [0.5, 0.9], input: {$divide: ["$x", 2]}, algorithm: "approximate"}}, + "Should be able to specify 'input' as an expression"); + +assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}}, + "Non-numeric inputs should be gracefully ignored"); + +/** + * Test that the "arrayness" of the result matches the "arrayness" of the specification. + */ +function assertArrayness(percentileSpec, msg) { + const pInSpec = percentileSpec.$percentile.p; + + const res = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray(); + + assert.eq(Array.isArray(pInSpec), Array.isArray(res[0].p), msg + ` result: ${tojson(res)}`); + if (Array.isArray(pInSpec)) { + assert.eq(pInSpec.length, res[0].p.length, msg + ` result: ${tojson(res)}`); + } +} + +assertArrayness({$percentile: {p: [0.2, 0.9], input: "$x", algorithm: "approximate"}}, + "Computed percentiles should be in an array of length 2"); + +assertArrayness({$percentile: {p: [0.2], input: "$x", algorithm: "approximate"}}, + "Computed percentiles should be in an array of length 1"); +})(); diff --git a/src/mongo/db/pipeline/SConscript b/src/mongo/db/pipeline/SConscript index 918878e266e..cf4b9d8e8d9 100644 --- a/src/mongo/db/pipeline/SConscript +++ b/src/mongo/db/pipeline/SConscript @@ -116,11 +116,13 @@ env.Library( 'accumulator_merge_objects.cpp', 'accumulator_min_max.cpp', 'accumulator_multi.cpp', + 'accumulator_percentile.cpp', 'accumulator_push.cpp', 'accumulator_rank.cpp', 'accumulator_std_dev.cpp', 'accumulator_sum.cpp', 'map_reduce_options.idl', + 'percentile_algo_sort_and_rank.cpp', 'window_function/window_bounds.cpp', 'window_function/window_function_covariance.cpp', 'window_function/window_function_count.cpp', diff --git a/src/mongo/db/pipeline/accumulator_percentile.cpp b/src/mongo/db/pipeline/accumulator_percentile.cpp new file mode 100644 index 00000000000..659c08d526e --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_percentile.cpp @@ -0,0 +1,174 @@ +/** + * Copyright (C) 2023-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/db/pipeline/accumulator_percentile.h" + +namespace mongo { + +using boost::intrusive_ptr; + +REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile, + AccumulatorPercentile::parseArgs, + feature_flags::gFeatureFlagApproxPercentiles); + +namespace { +// Checks that 'pv' is an array of valid percentile specifications. +std::vector<double> validatePercentileArg(const Value& pv) { + const auto msg = "'p' must be an array of numeric values from [0.0, 1.0] range, but found "_sd; + uassert(7429700, str::stream() << msg << pv.toString(), pv.isArray()); + + std::vector<double> ps; + for (const Value& p : pv.getArray()) { + uassert(7429701, str::stream() << msg << pv.toString(), p.numeric()); + auto pd = p.coerceToDouble(); + uassert(7429702, str::stream() << msg << pv.toString(), pd >= 0 && pd <= 1); + ps.push_back(pd); + } + return ps; +} +} // namespace + +// TODO SERVER-74556: Move parsing of the args into IDL. +AccumulationExpression AccumulatorPercentile::parseArgs(ExpressionContext* const expCtx, + BSONElement elem, + VariablesParseState vps) { + expCtx->sbeGroupCompatible = false; + + uassert(7429703, + str::stream() << "specification must be an object; found " << elem, + elem.type() == BSONType::Object); + BSONObj obj = elem.embeddedObject(); + + boost::intrusive_ptr<Expression> input; + std::vector<double> ps; + std::string algo = ""; + for (auto&& element : obj) { + auto fieldName = element.fieldNameStringData(); + if (fieldName == kFieldNameInput) { + input = Expression::parseOperand(expCtx, element, vps); + } else if (fieldName == kFieldNameP) { + auto pv = Value(element); + ps = validatePercentileArg(pv); + } else if (fieldName == kFieldNameAlgo) { + if (element.type() == BSONType::String) { + algo = element.String(); + } + // More types of percentiles will be supported in the future: PM-1883 + uassert(7429704, + str::stream() + << "the valid specifications for 'algorithm' are: 'approximate'; found " + << element, + algo == "approximate"); + } else { + uasserted(7429705, + str::stream() << "Unknown argument for 'percentile' operator: " << fieldName); + } + } + uassert(7429706, str::stream() << "Missing value for '" << kFieldNameP << "'", !ps.empty()); + uassert(7429707, str::stream() << "Missing value for '" << kFieldNameInput << "'", input); + uassert( + 7429708, str::stream() << "Missing value for '" << kFieldNameAlgo << "'", !algo.empty()); + + auto factory = [expCtx, ps] { + // Temporary implementation! To be replaced based on the user's choice of algorithm. + auto algo = PercentileAlgorithm::createDiscreteSortAndRank(); + + return AccumulatorPercentile::create(expCtx, ps, std::move(algo)); + }; + + return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/, + std::move(input) /*argument*/, + std::move(factory), + "$percentile"_sd /*name*/}; +} + +void AccumulatorPercentile::processInternal(const Value& input, bool merging) { + if (merging) { + // Merging percentiles from different shards. For approximate percentiels 'input' is a + // document with a digest. + // TODO SERVER-74358: Support sharded collections + uasserted(7429709, "Sharded collections are not yet supported by $percentile"); + return; + } + + if (!input.numeric()) { + return; + } + + _algo->incorporate(input.coerceToDouble()); + + // TODO SERVER-74558: Could/should we update the memory usage less often? + _memUsageBytes = sizeof(*this) + _algo->memUsageBytes(); +} + +Value AccumulatorPercentile::getValue(bool toBeMerged) { + if (toBeMerged) { + // Return a document, containing the whole digest, because they would need to be merged + // on mongos to compute the final result. + // TODO SERVER-74358: Support sharded collections + uasserted(7429710, "Sharded collections are not yet supported by $percentile"); + return Value(Document{}); + } + + // Compute the percentiles for each requested one in the order listed. Computing a percentile + // can only fail if there have been no numeric inputs, then all percentiles would fail. Rather + // than returning an array of nulls in this case, we return a single null value. + std::vector<double> pctls; + for (double p : _percentiles) { + auto res = _algo->computePercentile(p); + if (!res) { + return Value(BSONNULL); + } + pctls.push_back(res.value()); + } + + return Value(std::vector<Value>(pctls.begin(), pctls.end())); +} + +AccumulatorPercentile::AccumulatorPercentile(ExpressionContext* const expCtx, + const std::vector<double>& ps, + std::unique_ptr<PercentileAlgorithm> algo) + : AccumulatorState(expCtx), _percentiles(ps), _algo(std::move(algo)) { + _memUsageBytes = sizeof(*this) + _algo->memUsageBytes(); +} + +void AccumulatorPercentile::reset() { + // Temporary implementation! To be replaced based on the user's choice of algorithm. + auto algo = PercentileAlgorithm::createDiscreteSortAndRank(); + + _memUsageBytes = sizeof(*this) + _algo->memUsageBytes(); +} + +intrusive_ptr<AccumulatorState> AccumulatorPercentile::create( + ExpressionContext* const expCtx, + const std::vector<double>& ps, + std::unique_ptr<PercentileAlgorithm> algo) { + return new AccumulatorPercentile(expCtx, ps, std::move(algo)); +} +} // namespace mongo diff --git a/src/mongo/db/pipeline/accumulator_percentile.h b/src/mongo/db/pipeline/accumulator_percentile.h new file mode 100644 index 00000000000..f25724911f0 --- /dev/null +++ b/src/mongo/db/pipeline/accumulator_percentile.h @@ -0,0 +1,82 @@ +/** + * Copyright (C) 2023-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/db/pipeline/accumulation_statement.h" +#include "mongo/db/pipeline/percentile_algo.h" + +namespace mongo { +/** + * Accumulator for computing $percentile. + */ +class AccumulatorPercentile : public AccumulatorState { +public: + // {$percentile: {input: <Numeric>, p: <Array Expression>, algorithm: <String>} + static constexpr auto kFieldNameInput = "input"_sd; + static constexpr auto kFieldNameP = "p"_sd; + static constexpr auto kFieldNameAlgo = "algorithm"_sd; + + static constexpr auto kName = "$percentile"_sd; + const char* getOpName() const final { + return kName.rawData(); + } + + /** + * Parsing and creating the accumulator. A separate accumulator object is created per group. + */ + static AccumulationExpression parseArgs(ExpressionContext* expCtx, + BSONElement elem, + VariablesParseState vps); + + static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx, + const std::vector<double>& ps, + std::unique_ptr<PercentileAlgorithm> algo); + + AccumulatorPercentile(ExpressionContext* expCtx, + const std::vector<double>& ps, + std::unique_ptr<PercentileAlgorithm> algo); + + /** + * Ingressing values and computing the requested percentiles. + */ + void processInternal(const Value& input, bool merging) final; + Value getValue(bool toBeMerged) final; + + /** + * Other infra for the accumulators. + */ + void reset() final; + +private: + std::vector<double> _percentiles; + std::unique_ptr<PercentileAlgorithm> _algo; +}; + +} // namespace mongo diff --git a/src/mongo/db/pipeline/percentile_algo.h b/src/mongo/db/pipeline/percentile_algo.h new file mode 100644 index 00000000000..ce37ec2db77 --- /dev/null +++ b/src/mongo/db/pipeline/percentile_algo.h @@ -0,0 +1,77 @@ +/** + * Copyright (C) 2023-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include <boost/optional/optional.hpp> +#include <memory> + +namespace mongo { + +/** + * Eventually we'll be supporting multiple types of percentiles (discrete, continuous, approximate) + * and potentially multiple different algorithms for computing the approximate ones. + * + * The goal is to keep these algorithms MQL- and engine-agnostic with this interface. + */ +struct PercentileAlgorithm { + virtual ~PercentileAlgorithm() {} + + virtual void incorporate(double input) = 0; + virtual void incorporate(const std::vector<double>& inputs) = 0; + + /** + * 'p' must be from [0, 1] range. + * + * It is always valid to call 'computePercentile()', however, if no input has been incorporated + * yet, all implementations must return 'boost::none'. It is allowed to incorporate more inputs + * after calling 'computePercentile()' and call it again (naturally, the result might differ + * depending on the new data). + * + * Note 1: the implementations are free to either return "none" or throw if they require setting + * up for computing a specific percentile but a different one is requested here. + * + * Note 2: the implementations might have different tradeoffs regarding balancing performance of + * ingress vs computing the percentile, so this interface provides no perfomance guarantees. + * Refer to the documentation of the specific implementations for details. + */ + virtual boost::optional<double> computePercentile(double p) = 0; + + /* + * The owner might need a rough estimate of how much memory the algorithm is using. + */ + virtual long memUsageBytes() = 0; + + /** + * Factory methods for instantiating concrete algorithms. + */ + static std::unique_ptr<PercentileAlgorithm> createDiscreteSortAndRank(); +}; + +} // namespace mongo diff --git a/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp new file mode 100644 index 00000000000..921e8845f41 --- /dev/null +++ b/src/mongo/db/pipeline/percentile_algo_sort_and_rank.cpp @@ -0,0 +1,90 @@ +/** + * Copyright (C) 2023-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * <http://www.mongodb.com/licensing/server-side-public-license>. + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include <algorithm> +#include <cmath> +#include <vector> + +#include "mongo/db/pipeline/percentile_algo.h" + +namespace mongo { + +/** + * 'SortAndRank' algorithm for computing percentiles is accurate and doesn't require specifying a + * percentile in advance but is only suitable for small datasets. It accumulats all data sent to it + * and sorts it all when a percentile is requested. Requesting more percentiles after the first one + * without incorporating more data is fast as it doesn't need to sort again. + */ +class DiscreteSortAndRank : public PercentileAlgorithm { +public: + DiscreteSortAndRank() = default; // no config required for this algorithm + + void incorporate(double input) final { + // Take advantage of already sorted input -- avoid resorting it later. + if (!_shouldSort && !_accumulatedValues.empty() && input < _accumulatedValues.back()) { + _shouldSort = true; + } + + _accumulatedValues.push_back(input); + } + void incorporate(const std::vector<double>& inputs) final { + _shouldSort = true; + _accumulatedValues.reserve(_accumulatedValues.size() + inputs.size()); + _accumulatedValues.insert(_accumulatedValues.end(), inputs.begin(), inputs.end()); + } + + boost::optional<double> computePercentile(double p) final { + if (_accumulatedValues.empty()) { + return boost::none; + } + + if (_shouldSort) { + std::sort(_accumulatedValues.begin(), _accumulatedValues.end()); + _shouldSort = false; + } + + const double n = _accumulatedValues.size(); + const size_t rank = static_cast<size_t>(std::max<double>(0, std::ceil(p * n) - 1)); + return _accumulatedValues[rank]; + } + + long memUsageBytes() final { + return _accumulatedValues.capacity() * sizeof(double); + } + +private: + std::vector<double> _accumulatedValues; + bool _shouldSort = true; +}; + +std::unique_ptr<PercentileAlgorithm> PercentileAlgorithm::createDiscreteSortAndRank() { + return std::make_unique<DiscreteSortAndRank>(); +} + +} // namespace mongo |