diff options
Diffstat (limited to 'src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h')
-rw-r--r-- | src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h | 158 |
1 files changed, 158 insertions, 0 deletions
diff --git a/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h b/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h new file mode 100644 index 000000000..4374993bf --- /dev/null +++ b/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h @@ -0,0 +1,158 @@ +// Copyright (C) 2022 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 + +#pragma once + +#include "../../namespaces.h" + +#include <functional> +#include <optional> +#include <ostream> +#include <unordered_map> + +namespace QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE { + + template<typename T> + using Histogram = std::unordered_map<T, std::size_t>; + + template<typename InputIt, typename GroupBy> + auto make_histogram(InputIt begin, InputIt end, GroupBy&& group_by) { + Histogram<std::invoke_result_t<GroupBy, decltype(*begin)>> histogram{}; + + while (begin != end) { + auto key{std::invoke(std::forward<GroupBy>(group_by), *begin)}; + + histogram.try_emplace(key, 0); + histogram[key] += 1; + ++begin; + } + + return histogram; + } + + template<typename T> + struct DistributionError { + T value; + double probability; + double expected_probability; + }; + + template<typename T> + inline std::ostream& operator<<(std::ostream& os, const DistributionError<T>& error) { + return os << "DistributionError{" << + "The value { " << error.value << + " } appear with a probability of { " << error.probability << + " } while a probability of { " << error.expected_probability << " } was expected." << + "}"; + } + + // REMARK: The following should really return an Either of unit/error + // but std::variant in C++ is both extremely unusable and comes with a + // strong overhead unless certain conditions are met. + // For this reason, we keep to the less intutitive optional error. + + /*! + * Returns true when the given \a sequence approximately respects a + * given distribution. + * + * The \a sequence respects a given distribution when the count of + * each collection of values is a percentage of the total values that + * is near the percentage probability described by distribution. + * + * The values in \a sequence are collected according to \a group_by. + * \a group_by, given an element of \a sequence, should return a value + * of some type that represent the category of the inspected value. + * Values that have the same category share their count. + * + * The distribution that should be respected is given by \a + * probability_of. \a probability_of is a function that takes a + * category that was produced from a call to \a group_by and returns + * the expect probability, in percentage, of apperance for that + * category. + * + * The given probability is then compared to the one found by counting + * the element of \a sequence under \a group_by, to ensure that it + * matches. + * + * The margin of error for the comparison is given, in percentage + * points, by \a margin. + * The approximation uses an absolute comparison and scales the + * margin inversely based on the size of \a sequence, to account for the + * precision of the data set itself. + * + * When the distribution is not respected, a DistributionError is + * returned enclosed in an optional value. + * The error allows reports which the first category for which the + * comparison failed, along with its expected probability and the one + * that was actually inferred from \a sequence. + */ + template<typename T, typename GroupBy, typename ProbabilityOf> + std::optional<DistributionError<T>> respects_distribution(std::vector<T>&& sequence, GroupBy&& group_by, ProbabilityOf&& probability_of, double margin = 33) { + std::size_t data_point_amount{sequence.size()}; + + // REMARK: We scale the margin based on the data set to allow for + // an easier change in downstream tests. + // The precision required for the approximation will vary + // depending on how many values we generate. + // The amount of values we generate depends on how much time we + // want the tests to take. + // This amount may change in the future. For example, as code is + // added and tests are added, we might need some expensive + // computations here and there. + // Sometimes, this will increase the test suite runtime without an + // obvious way of improving the performance of the underlying code + // to reduce it. + // In those cases, the total run time can be decreased by running + // less generations for battle-tested tests. + // If some code has not been changed for a long time, it will have + // had thousands of generations by that point, giving us a good + // degree of certainty of it not being bugged (for whatever bugs + // the tests account for). + // Then, running a certain amount of generation is not required + // anymore such that some of them can be optimized out. + // For tests like the one using this function, where our ability + // to test is always dependent on the amount of generations, + // changing the generated amount will mean that we will need to + // change our conditions too, potentially changing the meaning of + // the test. + // To take this into account, we perform a scaling on the + // condition itself, so that if the amount of data points that are + // generated changes, we do not generally have to change anything + // in the condition. + // + // For this case, we scale logarithmically_10 for the simple + // reason that we tend to generate values in power of tens, + // starting with the 100 values default that Quickcheck used. + // + // The default value for the margin on which the scaling is based, + // was chosen heuristically. + // As we expect generation under 10^3 to be generally meaningless + // for this kind of testing, the value was chosen so that it would + // start to normalize around that amount. + // Deviation of about 5-10% were identified trough various + // generations for an amount of data points near 1000, while a + // deviation of about 1-3% was identified with about 10000 values. + // With the chosen default value, the scaling approaches those + // percentage points with some margin of error. + // + // We expect up to a 10%, or a bit more, deviation to be suitable + // for our purposes, as it would still allow for a varied + // distribution in downstream consumers. + double scaled_margin{margin * (1.0/std::log10(data_point_amount))}; + + auto histogram{make_histogram(sequence.begin(), sequence.end(), std::forward<GroupBy>(group_by))}; + + for (auto& bin : histogram) { + auto [key, count] = bin; + + double actual_percentage{percent_of(static_cast<double>(count), static_cast<double>(data_point_amount))}; + double expected_percentage{std::invoke(std::forward<ProbabilityOf>(probability_of), key)}; + + if (!(actual_percentage == Approx(expected_percentage).margin(scaled_margin))) + return std::make_optional(DistributionError<T>{key, actual_percentage, expected_percentage}); + } + + return std::nullopt; + } + +} // end QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE |