1 files changed, 158 insertions, 0 deletions
diff --git a/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h b/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h
new file mode 100644
index 000000000..4374993bf
--- /dev/null
+++ b/src/qdoc/catch_generators/src/catch_generators/utilities/statistics/distribution.h
@@ -0,0 +1,158 @@
+// Copyright (C) 2022 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
+
+#pragma once
+
+#include "../../namespaces.h"
+
+#include <functional>
+#include <optional>
+#include <ostream>
+#include <unordered_map>
+
+namespace QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE {
+
+    template<typename T>
+    using Histogram = std::unordered_map<T, std::size_t>;
+
+    template<typename InputIt, typename GroupBy>
+    auto make_histogram(InputIt begin, InputIt end, GroupBy&& group_by) {
+        Histogram<std::invoke_result_t<GroupBy, decltype(*begin)>> histogram{};
+
+        while (begin != end) {
+            auto key{std::invoke(std::forward<GroupBy>(group_by), *begin)};
+
+            histogram.try_emplace(key, 0);
+            histogram[key] += 1;
+            ++begin;
+        }
+
+        return histogram;
+    }
+
+    template<typename T>
+    struct DistributionError {
+        T value;
+        double probability;
+        double expected_probability;
+    };
+
+    template<typename T>
+    inline std::ostream& operator<<(std::ostream& os, const DistributionError<T>& error) {
+        return os << "DistributionError{" <<
+            "The value { " << error.value <<
+            " } appear with a probability of { " << error.probability <<
+            " } while a probability of { " << error.expected_probability << " } was expected." <<
+            "}";
+    }
+
+    // REMARK: The following should really return an Either of unit/error
+    // but std::variant in C++ is both extremely unusable and comes with a
+    // strong overhead unless certain conditions are met.
+    // For this reason, we keep to the less intutitive optional error.
+
+    /*!
+    * Returns true when the given \a sequence approximately respects a
+    * given distribution.
+    *
+    * The \a sequence respects a given distribution when the count of
+    * each collection of values is a percentage of the total values that
+    * is near the percentage probability described by distribution.
+    *
+    * The values in \a sequence are collected according to \a group_by.
+    * \a group_by, given an element of \a sequence, should return a value
+    * of some type that represent the category of the inspected value.
+    * Values that have the same category share their count.
+    *
+    * The distribution that should be respected is given by \a
+    * probability_of. \a probability_of is a function that takes a
+    * category that was produced from a call to \a group_by and returns
+    * the expect probability, in percentage, of apperance for that
+    * category.
+    *
+    * The given probability is then compared to the one found by counting
+    * the element of \a sequence under \a group_by, to ensure that it
+    * matches.
+    *
+    * The margin of error for the comparison is given, in percentage
+    * points, by \a margin.
+    * The approximation uses an absolute comparison and scales the
+    * margin inversely based on the size of \a sequence, to account for the
+    * precision of the data set itself.
+    *
+    * When the distribution is not respected, a DistributionError is
+    * returned enclosed in an optional value.
+    * The error allows reports which the first category for which the
+    * comparison failed, along with its expected probability and the one
+    * that was actually inferred from \a sequence.
+    */
+    template<typename T, typename GroupBy, typename ProbabilityOf>
+    std::optional<DistributionError<T>> respects_distribution(std::vector<T>&& sequence, GroupBy&& group_by, ProbabilityOf&& probability_of, double margin = 33) {
+        std::size_t data_point_amount{sequence.size()};
+
+        // REMARK: We scale the margin based on the data set to allow for
+        // an easier change in downstream tests.
+        // The precision required for the approximation will vary
+        // depending on how many values we generate.
+        // The amount of values we generate depends on how much time we
+        // want the tests to take.
+        // This amount may change in the future. For example, as code is
+        // added and tests are added, we might need some expensive
+        // computations here and there.
+        // Sometimes, this will increase the test suite runtime without an
+        // obvious way of improving the performance of the underlying code
+        // to reduce it.
+        // In those cases, the total run time can be decreased by running
+        // less generations for battle-tested tests.
+        // If some code has not been changed for a long time, it will have
+        // had thousands of generations by that point, giving us a good
+        // degree of certainty of it not being bugged (for whatever bugs
+        // the tests account for).
+        // Then, running a certain amount of generation is not required
+        // anymore such that some of them can be optimized out.
+        // For tests like the one using this function, where our ability
+        // to test is always dependent on the amount of generations,
+        // changing the generated amount will mean that we will need to
+        // change our conditions too, potentially changing the meaning of
+        // the test.
+        // To take this into account, we perform a scaling on the
+        // condition itself, so that if the amount of data points that are
+        // generated changes, we do not generally have to change anything
+        // in the condition.
+        //
+        // For this case, we scale logarithmically_10 for the simple
+        // reason that we tend to generate values in power of tens,
+        // starting with the 100 values default that Quickcheck used.
+        //
+        // The default value for the margin on which the scaling is based,
+        // was chosen heuristically.
+        // As we expect generation under 10^3 to be generally meaningless
+        // for this kind of testing, the value was chosen so that it would
+        // start to normalize around that amount.
+        // Deviation of about 5-10% were identified trough various
+        // generations for an amount of data points near 1000, while a
+        // deviation of about 1-3% was identified with about 10000 values.
+        // With the chosen default value, the scaling approaches those
+        // percentage points with some margin of error.
+        //
+        // We expect up to a 10%, or a bit more, deviation to be suitable
+        // for our purposes, as it would still allow for a varied
+        // distribution in downstream consumers.
+        double scaled_margin{margin * (1.0/std::log10(data_point_amount))};
+
+        auto histogram{make_histogram(sequence.begin(), sequence.end(), std::forward<GroupBy>(group_by))};
+
+        for (auto& bin : histogram) {
+            auto [key, count] = bin;
+
+            double actual_percentage{percent_of(static_cast<double>(count), static_cast<double>(data_point_amount))};
+            double expected_percentage{std::invoke(std::forward<ProbabilityOf>(probability_of), key)};
+
+            if (!(actual_percentage == Approx(expected_percentage).margin(scaled_margin)))
+                return std::make_optional(DistributionError<T>{key, actual_percentage, expected_percentage});
+        }
+
+        return std::nullopt;
+    }
+
+} // end QDOC_CATCH_GENERATORS_UTILITIES_ABSOLUTE_NAMESPACE