summaryrefslogtreecommitdiff
path: root/numpy/random/src
diff options
context:
space:
mode:
authorWarren Weckesser <warren.weckesser@gmail.com>2019-10-18 04:12:24 -0400
committerWarren Weckesser <warren.weckesser@gmail.com>2019-10-18 04:12:24 -0400
commit8634b18abdb30629b4b869f0c81bb7a350832baa (patch)
tree81ded688d97dcca235c9e6eded5210d7d94b5919 /numpy/random/src
parent9ee262b5bf89e8c866a507e4d62b78532361adc2 (diff)
downloadnumpy-8634b18abdb30629b4b869f0c81bb7a350832baa.tar.gz
ENH: random: Add the multivariate hypergeometric distribution
The new method multivariate_hypergeometric(self, object colors, object nsample, size=None, method='marginals') of the class numpy.random.Generator implements the multivariate hypergeometric distribution; see https://en.wikipedia.org/wiki/Hypergeometric_distribution, specifically the section "Multivariate hypergeometric distribution". Two algorithms are implemented. The user selects which algorithm to use with the `method` parameter. The default, `method='marginals'`, is based on repeated calls of the univariate hypergeometric distribution function. The other algorithm, selected with `method='count'`, is a brute-force method that allocates an internal array of length ``sum(colors)``. It should only be used when that value is small, but it can be much faster than the "marginals" algorithm in that case. The C implementations of the two methods are in the files random_mvhg_count.c and random_mvhg_marginals.c in numpy/random/src/distributions.
Diffstat (limited to 'numpy/random/src')
-rw-r--r--numpy/random/src/distributions/random_mvhg_count.c131
-rw-r--r--numpy/random/src/distributions/random_mvhg_marginals.c138
2 files changed, 269 insertions, 0 deletions
diff --git a/numpy/random/src/distributions/random_mvhg_count.c b/numpy/random/src/distributions/random_mvhg_count.c
new file mode 100644
index 000000000..9c0cc045d
--- /dev/null
+++ b/numpy/random/src/distributions/random_mvhg_count.c
@@ -0,0 +1,131 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "include/distributions.h"
+
+/*
+ * random_mvhg_count
+ *
+ * Draw variates from the multivariate hypergeometric distribution--
+ * the "count" algorithm.
+ *
+ * Parameters
+ * ----------
+ * bitgen_t *bitgen_state
+ * Pointer to a `bitgen_t` instance.
+ * int64_t total
+ * The sum of the values in the array `colors`. (This is redundant
+ * information, but we know the caller has already computed it, so
+ * we might as well use it.)
+ * size_t num_colors
+ * The length of the `colors` array.
+ * int64_t *colors
+ * The array of colors (i.e. the number of each type in the collection
+ * from which the random variate is drawn).
+ * int64_t nsample
+ * The number of objects drawn without replacement for each variate.
+ * `nsample` must not exceed sum(colors). This condition is not checked;
+ * it is assumed that the caller has already validated the value.
+ * size_t num_variates
+ * The number of variates to be produced and put in the array
+ * pointed to by `variates`. One variate is a vector of length
+ * `num_colors`, so the array pointed to by `variates` must have length
+ * `num_variates * num_colors`.
+ * int64_t *variates
+ * The array that will hold the result. It must have length
+ * `num_variates * num_colors`.
+ * The array is not initialized in the function; it is expected that the
+ * array has been initialized with zeros when the function is called.
+ *
+ * Notes
+ * -----
+ * The "count" algorithm for drawing one variate is roughly equivalent to the
+ * following numpy code:
+ *
+ * choices = np.repeat(np.arange(len(colors)), colors)
+ * selection = np.random.choice(choices, nsample, replace=False)
+ * variate = np.bincount(selection, minlength=len(colors))
+ *
+ * This function uses a temporary array with length sum(colors).
+ *
+ * Assumptions on the arguments (not checked in the function):
+ * * colors[k] >= 0 for k in range(num_colors)
+ * * total = sum(colors)
+ * * 0 <= nsample <= total
+ * * the product total * sizeof(size_t) does not exceed SIZE_MAX
+ * * the product num_variates * num_colors does not overflow
+ */
+
+int random_mvhg_count(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates)
+{
+ size_t *choices;
+ bool more_than_half;
+
+ if ((total == 0) || (nsample == 0) || (num_variates == 0)) {
+ // Nothing to do.
+ return 0;
+ }
+
+ choices = malloc(total * (sizeof *choices));
+ if (choices == NULL) {
+ return -1;
+ }
+
+ /*
+ * If colors contains, for example, [3 2 5], then choices
+ * will contain [0 0 0 1 1 2 2 2 2 2].
+ */
+ for (size_t i = 0, k = 0; i < num_colors; ++i) {
+ for (int64_t j = 0; j < colors[i]; ++j) {
+ choices[k] = i;
+ ++k;
+ }
+ }
+
+ more_than_half = nsample > (total / 2);
+ if (more_than_half) {
+ nsample = total - nsample;
+ }
+
+ for (size_t i = 0; i < num_variates * num_colors; i += num_colors) {
+ /*
+ * Fisher-Yates shuffle, but only loop through the first
+ * `nsample` entries of `choices`. After the loop,
+ * choices[:nsample] contains a random sample from the
+ * the full array.
+ */
+ for (size_t j = 0; j < (size_t) nsample; ++j) {
+ size_t tmp, k;
+ // Note: nsample is not greater than total, so there is no danger
+ // of integer underflow in `(size_t) total - j - 1`.
+ k = j + (size_t) random_interval(bitgen_state,
+ (size_t) total - j - 1);
+ tmp = choices[k];
+ choices[k] = choices[j];
+ choices[j] = tmp;
+ }
+ /*
+ * Count the number of occurrences of each value in choices[:nsample].
+ * The result, stored in sample[i:i+num_colors], is the sample from
+ * the multivariate hypergeometric distribution.
+ */
+ for (size_t j = 0; j < (size_t) nsample; ++j) {
+ variates[i + choices[j]] += 1;
+ }
+
+ if (more_than_half) {
+ for (size_t k = 0; k < num_colors; ++k) {
+ variates[i + k] = colors[k] - variates[i + k];
+ }
+ }
+ }
+
+ free(choices);
+
+ return 0;
+}
diff --git a/numpy/random/src/distributions/random_mvhg_marginals.c b/numpy/random/src/distributions/random_mvhg_marginals.c
new file mode 100644
index 000000000..301a4acad
--- /dev/null
+++ b/numpy/random/src/distributions/random_mvhg_marginals.c
@@ -0,0 +1,138 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <math.h>
+
+#include "include/distributions.h"
+#include "logfactorial.h"
+
+
+/*
+ * random_mvhg_marginals
+ *
+ * Draw samples from the multivariate hypergeometric distribution--
+ * the "marginals" algorithm.
+ *
+ * This version generates the sample by iteratively calling
+ * hypergeometric() (the univariate hypergeometric distribution).
+ *
+ * Parameters
+ * ----------
+ * bitgen_t *bitgen_state
+ * Pointer to a `bitgen_t` instance.
+ * int64_t total
+ * The sum of the values in the array `colors`. (This is redundant
+ * information, but we know the caller has already computed it, so
+ * we might as well use it.)
+ * size_t num_colors
+ * The length of the `colors` array. The functions assumes
+ * num_colors > 0.
+ * int64_t *colors
+ * The array of colors (i.e. the number of each type in the collection
+ * from which the random variate is drawn).
+ * int64_t nsample
+ * The number of objects drawn without replacement for each variate.
+ * `nsample` must not exceed sum(colors). This condition is not checked;
+ * it is assumed that the caller has already validated the value.
+ * size_t num_variates
+ * The number of variates to be produced and put in the array
+ * pointed to by `variates`. One variate is a vector of length
+ * `num_colors`, so the array pointed to by `variates` must have length
+ * `num_variates * num_colors`.
+ * int64_t *variates
+ * The array that will hold the result. It must have length
+ * `num_variates * num_colors`.
+ * The array is not initialized in the function; it is expected that the
+ * array has been initialized with zeros when the function is called.
+ *
+ * Notes
+ * -----
+ * Here's an example that demonstrates the idea of this algorithm.
+ *
+ * Suppose the urn contains red, green, blue and yellow marbles.
+ * Let nred be the number of red marbles, and define the quantities for
+ * the other colors similarly. The total number of marbles is
+ *
+ * total = nred + ngreen + nblue + nyellow.
+ *
+ * To generate a sample using rk_hypergeometric:
+ *
+ * red_sample = hypergeometric(ngood=nred, nbad=total - nred,
+ * nsample=nsample)
+ *
+ * This gives us the number of red marbles in the sample. The number of
+ * marbles in the sample that are *not* red is nsample - red_sample.
+ * To figure out the distribution of those marbles, we again use
+ * rk_hypergeometric:
+ *
+ * green_sample = hypergeometric(ngood=ngreen,
+ * nbad=total - nred - ngreen,
+ * nsample=nsample - red_sample)
+ *
+ * Similarly,
+ *
+ * blue_sample = hypergeometric(
+ * ngood=nblue,
+ * nbad=total - nred - ngreen - nblue,
+ * nsample=nsample - red_sample - green_sample)
+ *
+ * Finally,
+ *
+ * yellow_sample = total - (red_sample + green_sample + blue_sample).
+ *
+ * The above sequence of steps is implemented as a loop for an arbitrary
+ * number of colors in the innermost loop in the code below. `remaining`
+ * is the value passed to `nbad`; it is `total - colors[0]` in the first
+ * call to random_hypergeometric(), and then decreases by `colors[j]` in
+ * each iteration. `num_to_sample` is the `nsample` argument. It
+ * starts at this function's `nsample` input, and is decreased by the
+ * result of the call to random_hypergeometric() in each iteration.
+ *
+ * Assumptions on the arguments (not checked in the function):
+ * * colors[k] >= 0 for k in range(num_colors)
+ * * total = sum(colors)
+ * * 0 <= nsample <= total
+ * * the product num_variates * num_colors does not overflow
+ */
+
+void random_mvhg_marginals(bitgen_t *bitgen_state,
+ int64_t total,
+ size_t num_colors, int64_t *colors,
+ int64_t nsample,
+ size_t num_variates, int64_t *variates)
+{
+ bool more_than_half;
+
+ if ((total == 0) || (nsample == 0) || (num_variates == 0)) {
+ // Nothing to do.
+ return;
+ }
+
+ more_than_half = nsample > (total / 2);
+ if (more_than_half) {
+ nsample = total - nsample;
+ }
+
+ for (size_t i = 0; i < num_variates * num_colors; i += num_colors) {
+ int64_t num_to_sample = nsample;
+ int64_t remaining = total;
+ for (size_t j = 0; (num_to_sample > 0) && (j + 1 < num_colors); ++j) {
+ int64_t r;
+ remaining -= colors[j];
+ r = random_hypergeometric(bitgen_state,
+ colors[j], remaining, num_to_sample);
+ variates[i + j] = r;
+ num_to_sample -= r;
+ }
+
+ if (num_to_sample > 0) {
+ variates[i + num_colors - 1] = num_to_sample;
+ }
+
+ if (more_than_half) {
+ for (size_t k = 0; k < num_colors; ++k) {
+ variates[i + k] = colors[k] - variates[i + k];
+ }
+ }
+ }
+}