diff options
author | Harish Murthy <harish.v.murthy@gmail.com> | 2016-12-09 14:28:19 +0530 |
---|---|---|
committer | antirez <antirez@gmail.com> | 2016-12-16 11:07:30 +0100 |
commit | c55e3fbae5273d8a6fd5582e44d8745b2b81b7df (patch) | |
tree | 855c3c2879e5b3fd2e03e28f5766265ba7cadcc0 /src/hyperloglog.c | |
parent | 5ad2a94a1696fd7a87070836fdb081c4027730f5 (diff) | |
download | redis-c55e3fbae5273d8a6fd5582e44d8745b2b81b7df.tar.gz |
LogLog-Beta Algorithm support within HLL
Config option to use LogLog-Beta Algorithm for Cardinality
Diffstat (limited to 'src/hyperloglog.c')
-rw-r--r-- | src/hyperloglog.c | 70 |
1 files changed, 44 insertions, 26 deletions
diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 8ccc16be2..67a928729 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -993,33 +993,51 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { } else { serverPanic("Unknown HyperLogLog encoding in hllCount()"); } - - /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */ - E = (1/E)*alpha*m*m; - - /* Use the LINEARCOUNTING algorithm for small cardinalities. - * For larger values but up to 72000 HyperLogLog raw approximation is - * used since linear counting error starts to increase. However HyperLogLog - * shows a strong bias in the range 2.5*16384 - 72000, so we try to - * compensate for it. */ - if (E < m*2.5 && ez != 0) { - E = m*log(m/ez); /* LINEARCOUNTING() */ - } else if (m == 16384 && E < 72000) { - /* We did polynomial regression of the bias for this range, this - * way we can compute the bias for a given cardinality and correct - * according to it. Only apply the correction for P=14 that's what - * we use and the value the correction was verified with. */ - double bias = 5.9119*1.0e-18*(E*E*E*E) - -1.4253*1.0e-12*(E*E*E)+ - 1.2940*1.0e-7*(E*E) - -5.2921*1.0e-3*E+ - 83.3216; - E -= E*(bias/100); + + if(server.hll_use_loglogbeta) { + /* For loglog-beta there is a single formula to compute + * cardinality for the enture range + */ + + double zl = log(ez + 1); + double beta = -0.370393911*ez + + 0.070471823*zl + + 0.17393686*pow(zl,2) + + 0.16339839*pow(zl,3) + + -0.09237745*pow(zl,4) + + 0.03738027*pow(zl,5) + + -0.005384159*pow(zl,6) + + 0.00042419*pow(zl,7); + + E = alpha*m*(m-ez)*(1/(E+beta)); + } else { + /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */ + E = (1/E)*alpha*m*m; + + /* Use the LINEARCOUNTING algorithm for small cardinalities. + * For larger values but up to 72000 HyperLogLog raw approximation is + * used since linear counting error starts to increase. However HyperLogLog + * shows a strong bias in the range 2.5*16384 - 72000, so we try to + * compensate for it. */ + if (E < m*2.5 && ez != 0) { + E = m*log(m/ez); /* LINEARCOUNTING() */ + } else if (m == 16384 && E < 72000) { + /* We did polynomial regression of the bias for this range, this + * way we can compute the bias for a given cardinality and correct + * according to it. Only apply the correction for P=14 that's what + * we use and the value the correction was verified with. */ + double bias = 5.9119*1.0e-18*(E*E*E*E) + -1.4253*1.0e-12*(E*E*E)+ + 1.2940*1.0e-7*(E*E) + -5.2921*1.0e-3*E+ + 83.3216; + E -= E*(bias/100); + } + /* We don't apply the correction for E > 1/30 of 2^32 since we use + * a 64 bit function and 6 bit counters. To apply the correction for + * 1/30 of 2^64 is not needed since it would require a huge set + * to approach such a value. */ } - /* We don't apply the correction for E > 1/30 of 2^32 since we use - * a 64 bit function and 6 bit counters. To apply the correction for - * 1/30 of 2^64 is not needed since it would require a huge set - * to approach such a value. */ return (uint64_t) E; } |