summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2020-04-06 13:51:55 +0200
committerantirez <antirez@gmail.com>2020-04-06 13:51:55 +0200
commit121c51f4f338dcb160398c3254024867807aa112 (patch)
treea9486ab9ac6e7fea00fe27f4e6f5df7f4dc8dbb8
parentaf5c11874c4e534db0ca1c44c2ca20fec1ab2700 (diff)
parentaf3c722feccb2d002e57f02ee5ef623f86e7ea2f (diff)
downloadredis-121c51f4f338dcb160398c3254024867807aa112.tar.gz
Merge branch 'lcs' into unstable
-rw-r--r--src/db.c26
-rw-r--r--src/server.c6
-rw-r--r--src/server.h2
-rw-r--r--src/t_string.c204
-rw-r--r--tests/unit/type/string.tcl30
5 files changed, 267 insertions, 1 deletions
diff --git a/src/db.c b/src/db.c
index cadfeb77b..6e5a8bf3a 100644
--- a/src/db.c
+++ b/src/db.c
@@ -1554,6 +1554,32 @@ int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numk
return keys;
}
+/* LCS ... [KEYS <key1> <key2>] ... */
+int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys)
+{
+ int i;
+ int *keys = getKeysTempBuffer;
+ UNUSED(cmd);
+
+ /* We need to parse the options of the command in order to check for the
+ * "KEYS" argument before the "STRINGS" argument. */
+ for (i = 1; i < argc; i++) {
+ char *arg = argv[i]->ptr;
+ int moreargs = (argc-1) - i;
+
+ if (!strcasecmp(arg, "strings")) {
+ break;
+ } else if (!strcasecmp(arg, "keys") && moreargs >= 2) {
+ keys[0] = i+1;
+ keys[1] = i+2;
+ *numkeys = 2;
+ return keys;
+ }
+ }
+ *numkeys = 0;
+ return keys;
+}
+
/* Helper function to extract keys from memory command.
* MEMORY USAGE <key> */
int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) {
diff --git a/src/server.c b/src/server.c
index d6abc72d9..56feb09a3 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1004,7 +1004,11 @@ struct redisCommand redisCommandTable[] = {
{"acl",aclCommand,-2,
"admin no-script no-slowlog ok-loading ok-stale",
- 0,NULL,0,0,0,0,0,0}
+ 0,NULL,0,0,0,0,0,0},
+
+ {"lcs",lcsCommand,-4,
+ "write use-memory @string",
+ 0,lcsGetKeys,0,0,0,0,0,0}
};
/*============================ Utility functions ============================ */
diff --git a/src/server.h b/src/server.h
index f37fe4b12..b17995948 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2102,6 +2102,7 @@ int *migrateGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkey
int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
+int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
/* Cluster */
void clusterInit(void);
@@ -2373,6 +2374,7 @@ void xdelCommand(client *c);
void xtrimCommand(client *c);
void lolwutCommand(client *c);
void aclCommand(client *c);
+void lcsCommand(client *c);
#if defined(__GNUC__)
void *calloc(size_t count, size_t size) __attribute__ ((deprecated));
diff --git a/src/t_string.c b/src/t_string.c
index 3174e9ccd..335bda404 100644
--- a/src/t_string.c
+++ b/src/t_string.c
@@ -479,3 +479,207 @@ void strlenCommand(client *c) {
checkType(c,o,OBJ_STRING)) return;
addReplyLongLong(c,stringObjectLen(o));
}
+
+/* LCS -- Longest common subsequence.
+ *
+ * LCS [IDX] [MINMATCHLEN <len>]
+ * STRINGS <string> <string> | KEYS <keya> <keyb> */
+void lcsCommand(client *c) {
+ uint32_t i, j;
+ long long minmatchlen = 0;
+ sds a = NULL, b = NULL;
+ int getlen = 0, getidx = 0, withmatchlen = 0;
+ robj *obja = NULL, *objb = NULL;
+
+ for (j = 1; j < (uint32_t)c->argc; j++) {
+ char *opt = c->argv[j]->ptr;
+ int moreargs = (c->argc-1) - j;
+
+ if (!strcasecmp(opt,"IDX")) {
+ getidx = 1;
+ } else if (!strcasecmp(opt,"LEN")) {
+ getlen = 1;
+ } else if (!strcasecmp(opt,"WITHMATCHLEN")) {
+ withmatchlen = 1;
+ } else if (!strcasecmp(opt,"MINMATCHLEN") && moreargs) {
+ if (getLongLongFromObjectOrReply(c,c->argv[j+1],&minmatchlen,NULL)
+ != C_OK) return;
+ if (minmatchlen < 0) minmatchlen = 0;
+ j++;
+ } else if (!strcasecmp(opt,"STRINGS")) {
+ if (a != NULL) {
+ addReplyError(c,"Either use STRINGS or KEYS");
+ return;
+ }
+ a = c->argv[j+1]->ptr;
+ b = c->argv[j+2]->ptr;
+ j += 2;
+ } else if (!strcasecmp(opt,"KEYS")) {
+ if (a != NULL) {
+ addReplyError(c,"Either use STRINGS or KEYS");
+ return;
+ }
+ obja = lookupKeyRead(c->db,c->argv[j+1]);
+ objb = lookupKeyRead(c->db,c->argv[j+2]);
+ obja = obja ? getDecodedObject(obja) : createStringObject("",0);
+ objb = objb ? getDecodedObject(objb) : createStringObject("",0);
+ a = obja->ptr;
+ b = objb->ptr;
+ j += 2;
+ } else {
+ addReply(c,shared.syntaxerr);
+ return;
+ }
+ }
+
+ /* Complain if the user passed ambiguous parameters. */
+ if (a == NULL) {
+ addReplyError(c,"Please specify two strings: "
+ "STRINGS or KEYS options are mandatory");
+ return;
+ } else if (getlen && getidx) {
+ addReplyError(c,
+ "If you want both the length and indexes, please "
+ "just use IDX.");
+ return;
+ }
+
+ /* Compute the LCS using the vanilla dynamic programming technique of
+ * building a table of LCS(x,y) substrings. */
+ uint32_t alen = sdslen(a);
+ uint32_t blen = sdslen(b);
+
+ /* Setup an uint32_t array to store at LCS[i,j] the length of the
+ * LCS A0..i-1, B0..j-1. Note that we have a linear array here, so
+ * we index it as LCS[j+(blen+1)*j] */
+ uint32_t *lcs = zmalloc((alen+1)*(blen+1)*sizeof(uint32_t));
+ #define LCS(A,B) lcs[(B)+((A)*(blen+1))]
+
+ /* Start building the LCS table. */
+ for (uint32_t i = 0; i <= alen; i++) {
+ for (uint32_t j = 0; j <= blen; j++) {
+ if (i == 0 || j == 0) {
+ /* If one substring has length of zero, the
+ * LCS length is zero. */
+ LCS(i,j) = 0;
+ } else if (a[i-1] == b[j-1]) {
+ /* The len LCS (and the LCS itself) of two
+ * sequences with the same final character, is the
+ * LCS of the two sequences without the last char
+ * plus that last char. */
+ LCS(i,j) = LCS(i-1,j-1)+1;
+ } else {
+ /* If the last character is different, take the longest
+ * between the LCS of the first string and the second
+ * minus the last char, and the reverse. */
+ uint32_t lcs1 = LCS(i-1,j);
+ uint32_t lcs2 = LCS(i,j-1);
+ LCS(i,j) = lcs1 > lcs2 ? lcs1 : lcs2;
+ }
+ }
+ }
+
+ /* Store the actual LCS string in "result" if needed. We create
+ * it backward, but the length is already known, we store it into idx. */
+ uint32_t idx = LCS(alen,blen);
+ sds result = NULL; /* Resulting LCS string. */
+ void *arraylenptr = NULL; /* Deffered length of the array for IDX. */
+ uint32_t arange_start = alen, /* alen signals that values are not set. */
+ arange_end = 0,
+ brange_start = 0,
+ brange_end = 0;
+
+ /* Do we need to compute the actual LCS string? Allocate it in that case. */
+ int computelcs = getidx || !getlen;
+ if (computelcs) result = sdsnewlen(SDS_NOINIT,idx);
+
+ /* Start with a deferred array if we have to emit the ranges. */
+ uint32_t arraylen = 0; /* Number of ranges emitted in the array. */
+ if (getidx) {
+ addReplyMapLen(c,2);
+ addReplyBulkCString(c,"matches");
+ arraylenptr = addReplyDeferredLen(c);
+ }
+
+ i = alen, j = blen;
+ while (computelcs && i > 0 && j > 0) {
+ int emit_range = 0;
+ if (a[i-1] == b[j-1]) {
+ /* If there is a match, store the character and reduce
+ * the indexes to look for a new match. */
+ result[idx-1] = a[i-1];
+
+ /* Track the current range. */
+ if (arange_start == alen) {
+ arange_start = i-1;
+ arange_end = i-1;
+ brange_start = j-1;
+ brange_end = j-1;
+ } else {
+ /* Let's see if we can extend the range backward since
+ * it is contiguous. */
+ if (arange_start == i && brange_start == j) {
+ arange_start--;
+ brange_start--;
+ } else {
+ emit_range = 1;
+ }
+ }
+ /* Emit the range if we matched with the first byte of
+ * one of the two strings. We'll exit the loop ASAP. */
+ if (arange_start == 0 || brange_start == 0) emit_range = 1;
+ idx--; i--; j--;
+ } else {
+ /* Otherwise reduce i and j depending on the largest
+ * LCS between, to understand what direction we need to go. */
+ uint32_t lcs1 = LCS(i-1,j);
+ uint32_t lcs2 = LCS(i,j-1);
+ if (lcs1 > lcs2)
+ i--;
+ else
+ j--;
+ if (arange_start != alen) emit_range = 1;
+ }
+
+ /* Emit the current range if needed. */
+ uint32_t match_len = arange_end - arange_start + 1;
+ if (emit_range) {
+ if (minmatchlen == 0 || match_len >= minmatchlen) {
+ if (arraylenptr) {
+ addReplyArrayLen(c,2+withmatchlen);
+ addReplyArrayLen(c,2);
+ addReplyLongLong(c,arange_start);
+ addReplyLongLong(c,arange_end);
+ addReplyArrayLen(c,2);
+ addReplyLongLong(c,brange_start);
+ addReplyLongLong(c,brange_end);
+ if (withmatchlen) addReplyLongLong(c,match_len);
+ arraylen++;
+ }
+ }
+ arange_start = alen; /* Restart at the next match. */
+ }
+ }
+
+ /* Signal modified key, increment dirty, ... */
+
+ /* Reply depending on the given options. */
+ if (arraylenptr) {
+ addReplyBulkCString(c,"len");
+ addReplyLongLong(c,LCS(alen,blen));
+ setDeferredArrayLen(c,arraylenptr,arraylen);
+ } else if (getlen) {
+ addReplyLongLong(c,LCS(alen,blen));
+ } else {
+ addReplyBulkSds(c,result);
+ result = NULL;
+ }
+
+ /* Cleanup. */
+ if (obja) decrRefCount(obja);
+ if (objb) decrRefCount(objb);
+ sdsfree(result);
+ zfree(lcs);
+ return;
+}
+
diff --git a/tests/unit/type/string.tcl b/tests/unit/type/string.tcl
index 7122fd987..b9ef9de7a 100644
--- a/tests/unit/type/string.tcl
+++ b/tests/unit/type/string.tcl
@@ -419,4 +419,34 @@ start_server {tags {"string"}} {
r set foo bar
r getrange foo 0 4294967297
} {bar}
+
+ set rna1 {CACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTG}
+ set rna2 {ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT}
+ set rnalcs {ACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT}
+
+ test {LCS string output with STRINGS option} {
+ r LCS STRINGS $rna1 $rna2
+ } $rnalcs
+
+ test {LCS len} {
+ r LCS LEN STRINGS $rna1 $rna2
+ } [string length $rnalcs]
+
+ test {LCS with KEYS option} {
+ r set virus1 $rna1
+ r set virus2 $rna2
+ r LCS KEYS virus1 virus2
+ } $rnalcs
+
+ test {LCS indexes} {
+ dict get [r LCS IDX KEYS virus1 virus2] matches
+ } {{{238 238} {239 239}} {{236 236} {238 238}} {{229 230} {236 237}} {{224 224} {235 235}} {{1 222} {13 234}}}
+
+ test {LCS indexes with match len} {
+ dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN] matches
+ } {{{238 238} {239 239} 1} {{236 236} {238 238} 1} {{229 230} {236 237} 2} {{224 224} {235 235} 1} {{1 222} {13 234} 222}}
+
+ test {LCS indexes with match len and minimum match len} {
+ dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN MINMATCHLEN 5] matches
+ } {{{1 222} {13 234} 222}}
}