diff --git a/src/db.c b/src/db.c index cadfeb77..6e5a8bf3 100644 --- a/src/db.c +++ b/src/db.c @@ -1554,6 +1554,32 @@ int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numk return keys; } +/* LCS ... [KEYS ] ... */ +int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) +{ + int i; + int *keys = getKeysTempBuffer; + UNUSED(cmd); + + /* We need to parse the options of the command in order to check for the + * "KEYS" argument before the "STRINGS" argument. */ + for (i = 1; i < argc; i++) { + char *arg = argv[i]->ptr; + int moreargs = (argc-1) - i; + + if (!strcasecmp(arg, "strings")) { + break; + } else if (!strcasecmp(arg, "keys") && moreargs >= 2) { + keys[0] = i+1; + keys[1] = i+2; + *numkeys = 2; + return keys; + } + } + *numkeys = 0; + return keys; +} + /* Helper function to extract keys from memory command. * MEMORY USAGE */ int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) { diff --git a/src/server.c b/src/server.c index d6abc72d..56feb09a 100644 --- a/src/server.c +++ b/src/server.c @@ -1004,7 +1004,11 @@ struct redisCommand redisCommandTable[] = { {"acl",aclCommand,-2, "admin no-script no-slowlog ok-loading ok-stale", - 0,NULL,0,0,0,0,0,0} + 0,NULL,0,0,0,0,0,0}, + + {"lcs",lcsCommand,-4, + "write use-memory @string", + 0,lcsGetKeys,0,0,0,0,0,0} }; /*============================ Utility functions ============================ */ diff --git a/src/server.h b/src/server.h index f37fe4b1..b1799594 100644 --- a/src/server.h +++ b/src/server.h @@ -2102,6 +2102,7 @@ int *migrateGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkey int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); +int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); /* Cluster */ void clusterInit(void); @@ -2373,6 +2374,7 @@ void xdelCommand(client *c); void xtrimCommand(client *c); void lolwutCommand(client *c); void aclCommand(client *c); +void lcsCommand(client *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); diff --git a/src/t_string.c b/src/t_string.c index 3174e9cc..335bda40 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -479,3 +479,207 @@ void strlenCommand(client *c) { checkType(c,o,OBJ_STRING)) return; addReplyLongLong(c,stringObjectLen(o)); } + +/* LCS -- Longest common subsequence. + * + * LCS [IDX] [MINMATCHLEN ] + * STRINGS | KEYS */ +void lcsCommand(client *c) { + uint32_t i, j; + long long minmatchlen = 0; + sds a = NULL, b = NULL; + int getlen = 0, getidx = 0, withmatchlen = 0; + robj *obja = NULL, *objb = NULL; + + for (j = 1; j < (uint32_t)c->argc; j++) { + char *opt = c->argv[j]->ptr; + int moreargs = (c->argc-1) - j; + + if (!strcasecmp(opt,"IDX")) { + getidx = 1; + } else if (!strcasecmp(opt,"LEN")) { + getlen = 1; + } else if (!strcasecmp(opt,"WITHMATCHLEN")) { + withmatchlen = 1; + } else if (!strcasecmp(opt,"MINMATCHLEN") && moreargs) { + if (getLongLongFromObjectOrReply(c,c->argv[j+1],&minmatchlen,NULL) + != C_OK) return; + if (minmatchlen < 0) minmatchlen = 0; + j++; + } else if (!strcasecmp(opt,"STRINGS")) { + if (a != NULL) { + addReplyError(c,"Either use STRINGS or KEYS"); + return; + } + a = c->argv[j+1]->ptr; + b = c->argv[j+2]->ptr; + j += 2; + } else if (!strcasecmp(opt,"KEYS")) { + if (a != NULL) { + addReplyError(c,"Either use STRINGS or KEYS"); + return; + } + obja = lookupKeyRead(c->db,c->argv[j+1]); + objb = lookupKeyRead(c->db,c->argv[j+2]); + obja = obja ? getDecodedObject(obja) : createStringObject("",0); + objb = objb ? getDecodedObject(objb) : createStringObject("",0); + a = obja->ptr; + b = objb->ptr; + j += 2; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + + /* Complain if the user passed ambiguous parameters. */ + if (a == NULL) { + addReplyError(c,"Please specify two strings: " + "STRINGS or KEYS options are mandatory"); + return; + } else if (getlen && getidx) { + addReplyError(c, + "If you want both the length and indexes, please " + "just use IDX."); + return; + } + + /* Compute the LCS using the vanilla dynamic programming technique of + * building a table of LCS(x,y) substrings. */ + uint32_t alen = sdslen(a); + uint32_t blen = sdslen(b); + + /* Setup an uint32_t array to store at LCS[i,j] the length of the + * LCS A0..i-1, B0..j-1. Note that we have a linear array here, so + * we index it as LCS[j+(blen+1)*j] */ + uint32_t *lcs = zmalloc((alen+1)*(blen+1)*sizeof(uint32_t)); + #define LCS(A,B) lcs[(B)+((A)*(blen+1))] + + /* Start building the LCS table. */ + for (uint32_t i = 0; i <= alen; i++) { + for (uint32_t j = 0; j <= blen; j++) { + if (i == 0 || j == 0) { + /* If one substring has length of zero, the + * LCS length is zero. */ + LCS(i,j) = 0; + } else if (a[i-1] == b[j-1]) { + /* The len LCS (and the LCS itself) of two + * sequences with the same final character, is the + * LCS of the two sequences without the last char + * plus that last char. */ + LCS(i,j) = LCS(i-1,j-1)+1; + } else { + /* If the last character is different, take the longest + * between the LCS of the first string and the second + * minus the last char, and the reverse. */ + uint32_t lcs1 = LCS(i-1,j); + uint32_t lcs2 = LCS(i,j-1); + LCS(i,j) = lcs1 > lcs2 ? lcs1 : lcs2; + } + } + } + + /* Store the actual LCS string in "result" if needed. We create + * it backward, but the length is already known, we store it into idx. */ + uint32_t idx = LCS(alen,blen); + sds result = NULL; /* Resulting LCS string. */ + void *arraylenptr = NULL; /* Deffered length of the array for IDX. */ + uint32_t arange_start = alen, /* alen signals that values are not set. */ + arange_end = 0, + brange_start = 0, + brange_end = 0; + + /* Do we need to compute the actual LCS string? Allocate it in that case. */ + int computelcs = getidx || !getlen; + if (computelcs) result = sdsnewlen(SDS_NOINIT,idx); + + /* Start with a deferred array if we have to emit the ranges. */ + uint32_t arraylen = 0; /* Number of ranges emitted in the array. */ + if (getidx) { + addReplyMapLen(c,2); + addReplyBulkCString(c,"matches"); + arraylenptr = addReplyDeferredLen(c); + } + + i = alen, j = blen; + while (computelcs && i > 0 && j > 0) { + int emit_range = 0; + if (a[i-1] == b[j-1]) { + /* If there is a match, store the character and reduce + * the indexes to look for a new match. */ + result[idx-1] = a[i-1]; + + /* Track the current range. */ + if (arange_start == alen) { + arange_start = i-1; + arange_end = i-1; + brange_start = j-1; + brange_end = j-1; + } else { + /* Let's see if we can extend the range backward since + * it is contiguous. */ + if (arange_start == i && brange_start == j) { + arange_start--; + brange_start--; + } else { + emit_range = 1; + } + } + /* Emit the range if we matched with the first byte of + * one of the two strings. We'll exit the loop ASAP. */ + if (arange_start == 0 || brange_start == 0) emit_range = 1; + idx--; i--; j--; + } else { + /* Otherwise reduce i and j depending on the largest + * LCS between, to understand what direction we need to go. */ + uint32_t lcs1 = LCS(i-1,j); + uint32_t lcs2 = LCS(i,j-1); + if (lcs1 > lcs2) + i--; + else + j--; + if (arange_start != alen) emit_range = 1; + } + + /* Emit the current range if needed. */ + uint32_t match_len = arange_end - arange_start + 1; + if (emit_range) { + if (minmatchlen == 0 || match_len >= minmatchlen) { + if (arraylenptr) { + addReplyArrayLen(c,2+withmatchlen); + addReplyArrayLen(c,2); + addReplyLongLong(c,arange_start); + addReplyLongLong(c,arange_end); + addReplyArrayLen(c,2); + addReplyLongLong(c,brange_start); + addReplyLongLong(c,brange_end); + if (withmatchlen) addReplyLongLong(c,match_len); + arraylen++; + } + } + arange_start = alen; /* Restart at the next match. */ + } + } + + /* Signal modified key, increment dirty, ... */ + + /* Reply depending on the given options. */ + if (arraylenptr) { + addReplyBulkCString(c,"len"); + addReplyLongLong(c,LCS(alen,blen)); + setDeferredArrayLen(c,arraylenptr,arraylen); + } else if (getlen) { + addReplyLongLong(c,LCS(alen,blen)); + } else { + addReplyBulkSds(c,result); + result = NULL; + } + + /* Cleanup. */ + if (obja) decrRefCount(obja); + if (objb) decrRefCount(objb); + sdsfree(result); + zfree(lcs); + return; +} + diff --git a/tests/unit/type/string.tcl b/tests/unit/type/string.tcl index 7122fd98..b9ef9de7 100644 --- a/tests/unit/type/string.tcl +++ b/tests/unit/type/string.tcl @@ -419,4 +419,34 @@ start_server {tags {"string"}} { r set foo bar r getrange foo 0 4294967297 } {bar} + + set rna1 {CACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTG} + set rna2 {ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT} + set rnalcs {ACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT} + + test {LCS string output with STRINGS option} { + r LCS STRINGS $rna1 $rna2 + } $rnalcs + + test {LCS len} { + r LCS LEN STRINGS $rna1 $rna2 + } [string length $rnalcs] + + test {LCS with KEYS option} { + r set virus1 $rna1 + r set virus2 $rna2 + r LCS KEYS virus1 virus2 + } $rnalcs + + test {LCS indexes} { + dict get [r LCS IDX KEYS virus1 virus2] matches + } {{{238 238} {239 239}} {{236 236} {238 238}} {{229 230} {236 237}} {{224 224} {235 235}} {{1 222} {13 234}}} + + test {LCS indexes with match len} { + dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN] matches + } {{{238 238} {239 239} 1} {{236 236} {238 238} 1} {{229 230} {236 237} 2} {{224 224} {235 235} 1} {{1 222} {13 234} 222}} + + test {LCS indexes with match len and minimum match len} { + dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN MINMATCHLEN 5] matches + } {{{1 222} {13 234} 222}} }