summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2014-04-17 17:29:04 +0200
committerantirez <antirez@gmail.com>2014-04-18 16:16:20 +0200
commitf0eca148fffc265def5b5e07b679495d95e83f60 (patch)
tree1c2f27157b6dd40e9f2f23f96329f46f56ae64bc
parentd25fd8be641a0f036c5ab9632032c93d19dbfc71 (diff)
downloadredis-f0eca148fffc265def5b5e07b679495d95e83f60.tar.gz
PFCOUNT support for multi-key union.
-rw-r--r--src/hyperloglog.c77
-rw-r--r--src/redis.c2
2 files changed, 73 insertions, 6 deletions
diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index 1b984c15f..a44df87c9 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -198,8 +198,9 @@ struct hllhdr {
#define HLL_REGISTER_MAX ((1<<HLL_BITS)-1)
#define HLL_HDR_SIZE sizeof(struct hllhdr)
#define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8))
-#define HLL_DENSE 0 /* Dense encoding */
-#define HLL_SPARSE 1 /* Sparse encoding */
+#define HLL_DENSE 0 /* Dense encoding. */
+#define HLL_SPARSE 1 /* Sparse encoding. */
+#define HLL_RAW 255 /* Only used internally, never exposed. */
#define HLL_MAX_ENCODING 1
static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n";
@@ -920,12 +921,37 @@ double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *i
* as helpers to compute the SUM(2^-reg) part of the computation, which is
* representation-specific, while all the rest is common. */
+/* Implements the SUM operation for uint8_t data type which is only used
+ * internally as speedup for PFCOUNT with multiple keys. */
+double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
+ double E = 0;
+ int j, ez = 0;
+ unsigned long reg;
+
+ for (j = 0; j < HLL_REGISTERS; j++) {
+ reg = registers[j];
+ if (reg == 0) {
+ ez++;
+ E += 1; /* 2^(-reg[j]) is 1 when m is 0. */
+ } else {
+ E += PE[reg]; /* Precomputed 2^(-reg[j]). */
+ }
+ }
+ *ezp = ez;
+ return E;
+}
+
/* Return the approximated cardinality of the set based on the armonic
* mean of the registers values. 'hdr' points to the start of the SDS
* representing the String object holding the HLL representation.
*
* If the sparse representation of the HLL object is not valid, the integer
- * pointed by 'invalid' is set to non-zero, otherwise it is left untouched. */
+ * pointed by 'invalid' is set to non-zero, otherwise it is left untouched.
+ *
+ * hllCount() supports a special internal-only encoding of HLL_RAW, that
+ * is, hdr->registers will point to an uint8_t array of HLL_REGISTERS element.
+ * This is useful in order to speedup PFCOUNT when called against multiple
+ * keys (no need to work with 6-bit integers encoding). */
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
double m = HLL_REGISTERS;
double E, alpha = 0.7213/(1+1.079/m);
@@ -947,9 +973,13 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
/* Compute SUM(2^-register[0..i]). */
if (hdr->encoding == HLL_DENSE) {
E = hllDenseSum(hdr->registers,PE,&ez);
- } else {
+ } else if (hdr->encoding == HLL_SPARSE) {
E = hllSparseSum(hdr->registers,
sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid);
+ } else if (hdr->encoding == HLL_RAW) {
+ E = hllRawSum(hdr->registers,PE,&ez);
+ } else {
+ redisPanic("Unknown HyperLogLog encoding in hllCount()");
}
/* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */
@@ -1151,10 +1181,47 @@ void pfaddCommand(redisClient *c) {
/* PFCOUNT var -> approximated cardinality of set. */
void pfcountCommand(redisClient *c) {
- robj *o = lookupKeyRead(c->db,c->argv[1]);
+ robj *o;
struct hllhdr *hdr;
uint64_t card;
+ /* Case 1: multi-key keys, cardinality of the union.
+ *
+ * When multiple keys are specified, PFCOUNT actually computes
+ * the cardinality of the merge of the N HLLs specified. */
+ if (c->argc > 2) {
+ uint8_t max[HLL_HDR_SIZE+HLL_REGISTERS], *registers;
+ int j;
+
+ /* Compute an HLL with M[i] = MAX(M[i]_j). */
+ memset(max,0,sizeof(max));
+ hdr = (struct hllhdr*) max;
+ hdr->encoding = HLL_RAW; /* Special internal-only encoding. */
+ registers = max + HLL_HDR_SIZE;
+ for (j = 1; j < c->argc; j++) {
+ /* Check type and size. */
+ robj *o = lookupKeyRead(c->db,c->argv[j]);
+ if (o == NULL) continue; /* Assume empty HLL for non existing var. */
+ if (isHLLObjectOrReply(c,o) != REDIS_OK) return;
+
+ /* Merge with this HLL with our 'max' HHL by setting max[i]
+ * to MAX(max[i],hll[i]). */
+ if (hllMerge(registers,o) == REDIS_ERR) {
+ addReplySds(c,sdsnew(invalid_hll_err));
+ return;
+ }
+ }
+
+ /* Compute cardinality of the resulting set. */
+ addReplyLongLong(c,hllCount(hdr,NULL));
+ return;
+ }
+
+ /* Case 2: cardinality of the single HLL.
+ *
+ * The user specified a single key. Either return the cached value
+ * or compute one and update the cache. */
+ o = lookupKeyRead(c->db,c->argv[1]);
if (o == NULL) {
/* No key? Cardinality is zero since no element was added, otherwise
* we would have a key as HLLADD creates it as a side effect. */
diff --git a/src/redis.c b/src/redis.c
index d1cc2a476..bd66fcb9a 100644
--- a/src/redis.c
+++ b/src/redis.c
@@ -265,7 +265,7 @@ struct redisCommand redisCommandTable[] = {
{"bitpos",bitposCommand,-3,"r",0,NULL,1,1,1,0,0},
{"pfselftest",pfselftestCommand,1,"r",0,NULL,0,0,0,0,0},
{"pfadd",pfaddCommand,-2,"wm",0,NULL,1,1,1,0,0},
- {"pfcount",pfcountCommand,2,"w",0,NULL,1,1,1,0,0},
+ {"pfcount",pfcountCommand,-2,"w",0,NULL,1,1,1,0,0},
{"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0},
{"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}
};