Merge branch 'lcs' into unstable

This commit is contained in:
antirez 2020-04-06 13:51:55 +02:00
commit 121c51f4f3
5 changed files with 267 additions and 1 deletions

View File

@ -1554,6 +1554,32 @@ int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numk
return keys;
}
/* LCS ... [KEYS <key1> <key2>] ... */
int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys)
{
int i;
int *keys = getKeysTempBuffer;
UNUSED(cmd);
/* We need to parse the options of the command in order to check for the
* "KEYS" argument before the "STRINGS" argument. */
for (i = 1; i < argc; i++) {
char *arg = argv[i]->ptr;
int moreargs = (argc-1) - i;
if (!strcasecmp(arg, "strings")) {
break;
} else if (!strcasecmp(arg, "keys") && moreargs >= 2) {
keys[0] = i+1;
keys[1] = i+2;
*numkeys = 2;
return keys;
}
}
*numkeys = 0;
return keys;
}
/* Helper function to extract keys from memory command.
* MEMORY USAGE <key> */
int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) {

View File

@ -1004,7 +1004,11 @@ struct redisCommand redisCommandTable[] = {
{"acl",aclCommand,-2,
"admin no-script no-slowlog ok-loading ok-stale",
0,NULL,0,0,0,0,0,0}
0,NULL,0,0,0,0,0,0},
{"lcs",lcsCommand,-4,
"write use-memory @string",
0,lcsGetKeys,0,0,0,0,0,0}
};
/*============================ Utility functions ============================ */

View File

@ -2102,6 +2102,7 @@ int *migrateGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkey
int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
int *memoryGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
int *lcsGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys);
/* Cluster */
void clusterInit(void);
@ -2373,6 +2374,7 @@ void xdelCommand(client *c);
void xtrimCommand(client *c);
void lolwutCommand(client *c);
void aclCommand(client *c);
void lcsCommand(client *c);
#if defined(__GNUC__)
void *calloc(size_t count, size_t size) __attribute__ ((deprecated));

View File

@ -479,3 +479,207 @@ void strlenCommand(client *c) {
checkType(c,o,OBJ_STRING)) return;
addReplyLongLong(c,stringObjectLen(o));
}
/* LCS -- Longest common subsequence.
*
* LCS [IDX] [MINMATCHLEN <len>]
* STRINGS <string> <string> | KEYS <keya> <keyb> */
void lcsCommand(client *c) {
uint32_t i, j;
long long minmatchlen = 0;
sds a = NULL, b = NULL;
int getlen = 0, getidx = 0, withmatchlen = 0;
robj *obja = NULL, *objb = NULL;
for (j = 1; j < (uint32_t)c->argc; j++) {
char *opt = c->argv[j]->ptr;
int moreargs = (c->argc-1) - j;
if (!strcasecmp(opt,"IDX")) {
getidx = 1;
} else if (!strcasecmp(opt,"LEN")) {
getlen = 1;
} else if (!strcasecmp(opt,"WITHMATCHLEN")) {
withmatchlen = 1;
} else if (!strcasecmp(opt,"MINMATCHLEN") && moreargs) {
if (getLongLongFromObjectOrReply(c,c->argv[j+1],&minmatchlen,NULL)
!= C_OK) return;
if (minmatchlen < 0) minmatchlen = 0;
j++;
} else if (!strcasecmp(opt,"STRINGS")) {
if (a != NULL) {
addReplyError(c,"Either use STRINGS or KEYS");
return;
}
a = c->argv[j+1]->ptr;
b = c->argv[j+2]->ptr;
j += 2;
} else if (!strcasecmp(opt,"KEYS")) {
if (a != NULL) {
addReplyError(c,"Either use STRINGS or KEYS");
return;
}
obja = lookupKeyRead(c->db,c->argv[j+1]);
objb = lookupKeyRead(c->db,c->argv[j+2]);
obja = obja ? getDecodedObject(obja) : createStringObject("",0);
objb = objb ? getDecodedObject(objb) : createStringObject("",0);
a = obja->ptr;
b = objb->ptr;
j += 2;
} else {
addReply(c,shared.syntaxerr);
return;
}
}
/* Complain if the user passed ambiguous parameters. */
if (a == NULL) {
addReplyError(c,"Please specify two strings: "
"STRINGS or KEYS options are mandatory");
return;
} else if (getlen && getidx) {
addReplyError(c,
"If you want both the length and indexes, please "
"just use IDX.");
return;
}
/* Compute the LCS using the vanilla dynamic programming technique of
* building a table of LCS(x,y) substrings. */
uint32_t alen = sdslen(a);
uint32_t blen = sdslen(b);
/* Setup an uint32_t array to store at LCS[i,j] the length of the
* LCS A0..i-1, B0..j-1. Note that we have a linear array here, so
* we index it as LCS[j+(blen+1)*j] */
uint32_t *lcs = zmalloc((alen+1)*(blen+1)*sizeof(uint32_t));
#define LCS(A,B) lcs[(B)+((A)*(blen+1))]
/* Start building the LCS table. */
for (uint32_t i = 0; i <= alen; i++) {
for (uint32_t j = 0; j <= blen; j++) {
if (i == 0 || j == 0) {
/* If one substring has length of zero, the
* LCS length is zero. */
LCS(i,j) = 0;
} else if (a[i-1] == b[j-1]) {
/* The len LCS (and the LCS itself) of two
* sequences with the same final character, is the
* LCS of the two sequences without the last char
* plus that last char. */
LCS(i,j) = LCS(i-1,j-1)+1;
} else {
/* If the last character is different, take the longest
* between the LCS of the first string and the second
* minus the last char, and the reverse. */
uint32_t lcs1 = LCS(i-1,j);
uint32_t lcs2 = LCS(i,j-1);
LCS(i,j) = lcs1 > lcs2 ? lcs1 : lcs2;
}
}
}
/* Store the actual LCS string in "result" if needed. We create
* it backward, but the length is already known, we store it into idx. */
uint32_t idx = LCS(alen,blen);
sds result = NULL; /* Resulting LCS string. */
void *arraylenptr = NULL; /* Deffered length of the array for IDX. */
uint32_t arange_start = alen, /* alen signals that values are not set. */
arange_end = 0,
brange_start = 0,
brange_end = 0;
/* Do we need to compute the actual LCS string? Allocate it in that case. */
int computelcs = getidx || !getlen;
if (computelcs) result = sdsnewlen(SDS_NOINIT,idx);
/* Start with a deferred array if we have to emit the ranges. */
uint32_t arraylen = 0; /* Number of ranges emitted in the array. */
if (getidx) {
addReplyMapLen(c,2);
addReplyBulkCString(c,"matches");
arraylenptr = addReplyDeferredLen(c);
}
i = alen, j = blen;
while (computelcs && i > 0 && j > 0) {
int emit_range = 0;
if (a[i-1] == b[j-1]) {
/* If there is a match, store the character and reduce
* the indexes to look for a new match. */
result[idx-1] = a[i-1];
/* Track the current range. */
if (arange_start == alen) {
arange_start = i-1;
arange_end = i-1;
brange_start = j-1;
brange_end = j-1;
} else {
/* Let's see if we can extend the range backward since
* it is contiguous. */
if (arange_start == i && brange_start == j) {
arange_start--;
brange_start--;
} else {
emit_range = 1;
}
}
/* Emit the range if we matched with the first byte of
* one of the two strings. We'll exit the loop ASAP. */
if (arange_start == 0 || brange_start == 0) emit_range = 1;
idx--; i--; j--;
} else {
/* Otherwise reduce i and j depending on the largest
* LCS between, to understand what direction we need to go. */
uint32_t lcs1 = LCS(i-1,j);
uint32_t lcs2 = LCS(i,j-1);
if (lcs1 > lcs2)
i--;
else
j--;
if (arange_start != alen) emit_range = 1;
}
/* Emit the current range if needed. */
uint32_t match_len = arange_end - arange_start + 1;
if (emit_range) {
if (minmatchlen == 0 || match_len >= minmatchlen) {
if (arraylenptr) {
addReplyArrayLen(c,2+withmatchlen);
addReplyArrayLen(c,2);
addReplyLongLong(c,arange_start);
addReplyLongLong(c,arange_end);
addReplyArrayLen(c,2);
addReplyLongLong(c,brange_start);
addReplyLongLong(c,brange_end);
if (withmatchlen) addReplyLongLong(c,match_len);
arraylen++;
}
}
arange_start = alen; /* Restart at the next match. */
}
}
/* Signal modified key, increment dirty, ... */
/* Reply depending on the given options. */
if (arraylenptr) {
addReplyBulkCString(c,"len");
addReplyLongLong(c,LCS(alen,blen));
setDeferredArrayLen(c,arraylenptr,arraylen);
} else if (getlen) {
addReplyLongLong(c,LCS(alen,blen));
} else {
addReplyBulkSds(c,result);
result = NULL;
}
/* Cleanup. */
if (obja) decrRefCount(obja);
if (objb) decrRefCount(objb);
sdsfree(result);
zfree(lcs);
return;
}

View File

@ -419,4 +419,34 @@ start_server {tags {"string"}} {
r set foo bar
r getrange foo 0 4294967297
} {bar}
set rna1 {CACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTG}
set rna2 {ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT}
set rnalcs {ACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT}
test {LCS string output with STRINGS option} {
r LCS STRINGS $rna1 $rna2
} $rnalcs
test {LCS len} {
r LCS LEN STRINGS $rna1 $rna2
} [string length $rnalcs]
test {LCS with KEYS option} {
r set virus1 $rna1
r set virus2 $rna2
r LCS KEYS virus1 virus2
} $rnalcs
test {LCS indexes} {
dict get [r LCS IDX KEYS virus1 virus2] matches
} {{{238 238} {239 239}} {{236 236} {238 238}} {{229 230} {236 237}} {{224 224} {235 235}} {{1 222} {13 234}}}
test {LCS indexes with match len} {
dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN] matches
} {{{238 238} {239 239} 1} {{236 236} {238 238} 1} {{229 230} {236 237} 2} {{224 224} {235 235} 1} {{1 222} {13 234} 222}}
test {LCS indexes with match len and minimum match len} {
dict get [r LCS IDX KEYS virus1 virus2 WITHMATCHLEN MINMATCHLEN 5] matches
} {{{1 222} {13 234} 222}}
}