diff options
author | Warren Weckesser <warren.weckesser@gmail.com> | 2019-10-18 04:12:24 -0400 |
---|---|---|
committer | Warren Weckesser <warren.weckesser@gmail.com> | 2019-10-18 04:12:24 -0400 |
commit | 8634b18abdb30629b4b869f0c81bb7a350832baa (patch) | |
tree | 81ded688d97dcca235c9e6eded5210d7d94b5919 /numpy/random/src | |
parent | 9ee262b5bf89e8c866a507e4d62b78532361adc2 (diff) | |
download | numpy-8634b18abdb30629b4b869f0c81bb7a350832baa.tar.gz |
ENH: random: Add the multivariate hypergeometric distribution
The new method
multivariate_hypergeometric(self, object colors, object nsample,
size=None, method='marginals')
of the class numpy.random.Generator implements the multivariate
hypergeometric distribution; see
https://en.wikipedia.org/wiki/Hypergeometric_distribution,
specifically the section "Multivariate hypergeometric distribution".
Two algorithms are implemented. The user selects which algorithm
to use with the `method` parameter. The default, `method='marginals'`,
is based on repeated calls of the univariate hypergeometric
distribution function. The other algorithm, selected with
`method='count'`, is a brute-force method that allocates an
internal array of length ``sum(colors)``. It should only be used
when that value is small, but it can be much faster than the
"marginals" algorithm in that case.
The C implementations of the two methods are in the files
random_mvhg_count.c and random_mvhg_marginals.c in
numpy/random/src/distributions.
Diffstat (limited to 'numpy/random/src')
-rw-r--r-- | numpy/random/src/distributions/random_mvhg_count.c | 131 | ||||
-rw-r--r-- | numpy/random/src/distributions/random_mvhg_marginals.c | 138 |
2 files changed, 269 insertions, 0 deletions
diff --git a/numpy/random/src/distributions/random_mvhg_count.c b/numpy/random/src/distributions/random_mvhg_count.c new file mode 100644 index 000000000..9c0cc045d --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_count.c @@ -0,0 +1,131 @@ +#include <stdint.h> +#include <stdlib.h> +#include <stdbool.h> + +#include "include/distributions.h" + +/* + * random_mvhg_count + * + * Draw variates from the multivariate hypergeometric distribution-- + * the "count" algorithm. + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * The "count" algorithm for drawing one variate is roughly equivalent to the + * following numpy code: + * + * choices = np.repeat(np.arange(len(colors)), colors) + * selection = np.random.choice(choices, nsample, replace=False) + * variate = np.bincount(selection, minlength=len(colors)) + * + * This function uses a temporary array with length sum(colors). + * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product total * sizeof(size_t) does not exceed SIZE_MAX + * * the product num_variates * num_colors does not overflow + */ + +int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + size_t *choices; + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. + return 0; + } + + choices = malloc(total * (sizeof *choices)); + if (choices == NULL) { + return -1; + } + + /* + * If colors contains, for example, [3 2 5], then choices + * will contain [0 0 0 1 1 2 2 2 2 2]. + */ + for (size_t i = 0, k = 0; i < num_colors; ++i) { + for (int64_t j = 0; j < colors[i]; ++j) { + choices[k] = i; + ++k; + } + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + /* + * Fisher-Yates shuffle, but only loop through the first + * `nsample` entries of `choices`. After the loop, + * choices[:nsample] contains a random sample from the + * the full array. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + size_t tmp, k; + // Note: nsample is not greater than total, so there is no danger + // of integer underflow in `(size_t) total - j - 1`. + k = j + (size_t) random_interval(bitgen_state, + (size_t) total - j - 1); + tmp = choices[k]; + choices[k] = choices[j]; + choices[j] = tmp; + } + /* + * Count the number of occurrences of each value in choices[:nsample]. + * The result, stored in sample[i:i+num_colors], is the sample from + * the multivariate hypergeometric distribution. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + variates[i + choices[j]] += 1; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } + + free(choices); + + return 0; +} diff --git a/numpy/random/src/distributions/random_mvhg_marginals.c b/numpy/random/src/distributions/random_mvhg_marginals.c new file mode 100644 index 000000000..301a4acad --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_marginals.c @@ -0,0 +1,138 @@ +#include <stdint.h> +#include <stddef.h> +#include <stdbool.h> +#include <math.h> + +#include "include/distributions.h" +#include "logfactorial.h" + + +/* + * random_mvhg_marginals + * + * Draw samples from the multivariate hypergeometric distribution-- + * the "marginals" algorithm. + * + * This version generates the sample by iteratively calling + * hypergeometric() (the univariate hypergeometric distribution). + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. The functions assumes + * num_colors > 0. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * Here's an example that demonstrates the idea of this algorithm. + * + * Suppose the urn contains red, green, blue and yellow marbles. + * Let nred be the number of red marbles, and define the quantities for + * the other colors similarly. The total number of marbles is + * + * total = nred + ngreen + nblue + nyellow. + * + * To generate a sample using rk_hypergeometric: + * + * red_sample = hypergeometric(ngood=nred, nbad=total - nred, + * nsample=nsample) + * + * This gives us the number of red marbles in the sample. The number of + * marbles in the sample that are *not* red is nsample - red_sample. + * To figure out the distribution of those marbles, we again use + * rk_hypergeometric: + * + * green_sample = hypergeometric(ngood=ngreen, + * nbad=total - nred - ngreen, + * nsample=nsample - red_sample) + * + * Similarly, + * + * blue_sample = hypergeometric( + * ngood=nblue, + * nbad=total - nred - ngreen - nblue, + * nsample=nsample - red_sample - green_sample) + * + * Finally, + * + * yellow_sample = total - (red_sample + green_sample + blue_sample). + * + * The above sequence of steps is implemented as a loop for an arbitrary + * number of colors in the innermost loop in the code below. `remaining` + * is the value passed to `nbad`; it is `total - colors[0]` in the first + * call to random_hypergeometric(), and then decreases by `colors[j]` in + * each iteration. `num_to_sample` is the `nsample` argument. It + * starts at this function's `nsample` input, and is decreased by the + * result of the call to random_hypergeometric() in each iteration. + * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product num_variates * num_colors does not overflow + */ + +void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. + return; + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + int64_t num_to_sample = nsample; + int64_t remaining = total; + for (size_t j = 0; (num_to_sample > 0) && (j + 1 < num_colors); ++j) { + int64_t r; + remaining -= colors[j]; + r = random_hypergeometric(bitgen_state, + colors[j], remaining, num_to_sample); + variates[i + j] = r; + num_to_sample -= r; + } + + if (num_to_sample > 0) { + variates[i + num_colors - 1] = num_to_sample; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } +} |