summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSalvatore Sanfilippo <antirez@gmail.com>2020-02-20 13:00:39 +0100
committerGitHub <noreply@github.com>2020-02-20 13:00:39 +0100
commite741b0c257dff894c9999c145c664617b4400c17 (patch)
treec233995d11d380f24ab3def0cdc691c9bd4a8e45
parent9fe02ce1cda2673888b8f44072858389fb22f32f (diff)
parent485425cec76e2bfeb3e11503c4519f3d526bc09e (diff)
downloadredis-e741b0c257dff894c9999c145c664617b4400c17.tar.gz
Merge pull request #6903 from oranagra/defrag_lists
Defrag big lists in portions to avoid latency and freeze
-rw-r--r--src/defrag.c96
-rw-r--r--src/quicklist.c150
-rw-r--r--src/quicklist.h46
-rw-r--r--tests/unit/memefficiency.tcl92
4 files changed, 350 insertions, 34 deletions
diff --git a/src/defrag.c b/src/defrag.c
index 04e57955b..e729297a5 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -5,8 +5,8 @@
* We do that by scanning the keyspace and for each pointer we have, we can try to
* ask the allocator if moving it to a new address will help reduce fragmentation.
*
- * Copyright (c) 2017, Oran Agra
- * Copyright (c) 2017, Redis Labs, Inc
+ * Copyright (c) 2020, Oran Agra
+ * Copyright (c) 2020, Redis Labs, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -408,25 +408,32 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
return NULL;
}
-long activeDefragQuickListNodes(quicklist *ql) {
- quicklistNode *node = ql->head, *newnode;
+long activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
+ quicklistNode *newnode, *node = *node_ref;
long defragged = 0;
unsigned char *newzl;
+ if ((newnode = activeDefragAlloc(node))) {
+ if (newnode->prev)
+ newnode->prev->next = newnode;
+ else
+ ql->head = newnode;
+ if (newnode->next)
+ newnode->next->prev = newnode;
+ else
+ ql->tail = newnode;
+ *node_ref = node = newnode;
+ defragged++;
+ }
+ if ((newzl = activeDefragAlloc(node->zl)))
+ defragged++, node->zl = newzl;
+ return defragged;
+}
+
+long activeDefragQuickListNodes(quicklist *ql) {
+ quicklistNode *node = ql->head;
+ long defragged = 0;
while (node) {
- if ((newnode = activeDefragAlloc(node))) {
- if (newnode->prev)
- newnode->prev->next = newnode;
- else
- ql->head = newnode;
- if (newnode->next)
- newnode->next->prev = newnode;
- else
- ql->tail = newnode;
- node = newnode;
- defragged++;
- }
- if ((newzl = activeDefragAlloc(node->zl)))
- defragged++, node->zl = newzl;
+ defragged += activeDefragQuickListNode(ql, &node);
node = node->next;
}
return defragged;
@@ -440,12 +447,48 @@ void defragLater(redisDb *db, dictEntry *kde) {
listAddNodeTail(db->defrag_later, key);
}
-long scanLaterList(robj *ob) {
+/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */
+long scanLaterList(robj *ob, unsigned long *cursor, long long endtime, long long *defragged) {
quicklist *ql = ob->ptr;
+ quicklistNode *node;
+ long iterations = 0;
+ int bookmark_failed = 0;
if (ob->type != OBJ_LIST || ob->encoding != OBJ_ENCODING_QUICKLIST)
return 0;
- server.stat_active_defrag_scanned+=ql->len;
- return activeDefragQuickListNodes(ql);
+
+ if (*cursor == 0) {
+ /* if cursor is 0, we start new iteration */
+ node = ql->head;
+ } else {
+ node = quicklistBookmarkFind(ql, "_AD");
+ if (!node) {
+ /* if the bookmark was deleted, it means we reached the end. */
+ *cursor = 0;
+ return 0;
+ }
+ node = node->next;
+ }
+
+ (*cursor)++;
+ while (node) {
+ (*defragged) += activeDefragQuickListNode(ql, &node);
+ server.stat_active_defrag_scanned++;
+ if (++iterations > 128 && !bookmark_failed) {
+ if (ustime() > endtime) {
+ if (!quicklistBookmarkCreate(&ql, "_AD", node)) {
+ bookmark_failed = 1;
+ } else {
+ ob->ptr = ql; /* bookmark creation may have re-allocated the quicklist */
+ return 1;
+ }
+ }
+ iterations = 0;
+ }
+ node = node->next;
+ }
+ quicklistBookmarkDelete(ql, "_AD");
+ *cursor = 0;
+ return bookmark_failed? 1: 0;
}
typedef struct {
@@ -638,7 +681,8 @@ int scanLaterStraemListpacks(robj *ob, unsigned long *cursor, long long endtime,
void *newdata = activeDefragAlloc(ri.data);
if (newdata)
raxSetData(ri.node, ri.data=newdata), (*defragged)++;
- if (++iterations > 16) {
+ server.stat_active_defrag_scanned++;
+ if (++iterations > 128) {
if (ustime() > endtime) {
serverAssert(ri.key_len==sizeof(last));
memcpy(last,ri.key,ri.key_len);
@@ -900,8 +944,7 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime) {
if (de) {
robj *ob = dictGetVal(de);
if (ob->type == OBJ_LIST) {
- server.stat_active_defrag_hits += scanLaterList(ob);
- *cursor = 0; /* list has no scan, we must finish it in one go */
+ return scanLaterList(ob, cursor, endtime, &server.stat_active_defrag_hits);
} else if (ob->type == OBJ_SET) {
server.stat_active_defrag_hits += scanLaterSet(ob, cursor);
} else if (ob->type == OBJ_ZSET) {
@@ -961,11 +1004,6 @@ int defragLaterStep(redisDb *db, long long endtime) {
if (defragLaterItem(de, &defrag_later_cursor, endtime))
quit = 1; /* time is up, we didn't finish all the work */
- /* Don't start a new BIG key in this loop, this is because the
- * next key can be a list, and scanLaterList must be done in once cycle */
- if (!defrag_later_cursor)
- quit = 1;
-
/* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields
* (if we have a lot of pointers in one hash bucket, or rehashing),
* check if we reached the time limit. */
diff --git a/src/quicklist.c b/src/quicklist.c
index 7b5484116..ae183ffd8 100644
--- a/src/quicklist.c
+++ b/src/quicklist.c
@@ -70,6 +70,12 @@ static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536};
} while (0);
#endif
+/* Bookmarks forward declarations */
+#define QL_MAX_BM ((1 << QL_BM_BITS)-1)
+quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name);
+quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node);
+void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm);
+
/* Simple way to give quicklistEntry structs default values with one call. */
#define initEntry(e) \
do { \
@@ -100,10 +106,11 @@ quicklist *quicklistCreate(void) {
quicklist->count = 0;
quicklist->compress = 0;
quicklist->fill = -2;
+ quicklist->bookmark_count = 0;
return quicklist;
}
-#define COMPRESS_MAX (1 << 16)
+#define COMPRESS_MAX (1 << QL_COMP_BITS)
void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
if (compress > COMPRESS_MAX) {
compress = COMPRESS_MAX;
@@ -113,7 +120,7 @@ void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
quicklist->compress = compress;
}
-#define FILL_MAX (1 << 15)
+#define FILL_MAX (1 << (QL_FILL_BITS-1))
void quicklistSetFill(quicklist *quicklist, int fill) {
if (fill > FILL_MAX) {
fill = FILL_MAX;
@@ -169,6 +176,7 @@ void quicklistRelease(quicklist *quicklist) {
quicklist->len--;
current = next;
}
+ quicklistBookmarksClear(quicklist);
zfree(quicklist);
}
@@ -578,6 +586,15 @@ quicklist *quicklistCreateFromZiplist(int fill, int compress,
REDIS_STATIC void __quicklistDelNode(quicklist *quicklist,
quicklistNode *node) {
+ /* Update the bookmark if any */
+ quicklistBookmark *bm = _quicklistBookmarkFindByNode(quicklist, node);
+ if (bm) {
+ bm->node = node->next;
+ /* if the bookmark was to the last node, delete it. */
+ if (!bm->node)
+ _quicklistBookmarkDelete(quicklist, bm);
+ }
+
if (node->next)
node->next->prev = node->prev;
if (node->prev)
@@ -1410,6 +1427,87 @@ void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
}
}
+/* Create or update a bookmark in the list which will be updated to the next node
+ * automatically when the one referenced gets deleted.
+ * Returns 1 on success (creation of new bookmark or override of an existing one).
+ * Returns 0 on failure (reached the maximum supported number of bookmarks).
+ * NOTE: use short simple names, so that string compare on find is quick.
+ * NOTE: bookmakrk creation may re-allocate the quicklist, so the input pointer
+ may change and it's the caller responsibilty to update the reference.
+ */
+int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node) {
+ quicklist *ql = *ql_ref;
+ if (ql->bookmark_count >= QL_MAX_BM)
+ return 0;
+ quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
+ if (bm) {
+ bm->node = node;
+ return 1;
+ }
+ ql = zrealloc(ql, sizeof(quicklist) + (ql->bookmark_count+1) * sizeof(quicklistBookmark));
+ *ql_ref = ql;
+ ql->bookmarks[ql->bookmark_count].node = node;
+ ql->bookmarks[ql->bookmark_count].name = zstrdup(name);
+ ql->bookmark_count++;
+ return 1;
+}
+
+/* Find the quicklist node referenced by a named bookmark.
+ * When the bookmarked node is deleted the bookmark is updated to the next node,
+ * and if that's the last node, the bookmark is deleted (so find returns NULL). */
+quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name) {
+ quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
+ if (!bm) return NULL;
+ return bm->node;
+}
+
+/* Delete a named bookmark.
+ * returns 0 if bookmark was not found, and 1 if deleted.
+ * Note that the bookmark memory is not freed yet, and is kept for future use. */
+int quicklistBookmarkDelete(quicklist *ql, const char *name) {
+ quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
+ if (!bm)
+ return 0;
+ _quicklistBookmarkDelete(ql, bm);
+ return 1;
+}
+
+quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name) {
+ unsigned i;
+ for (i=0; i<ql->bookmark_count; i++) {
+ if (!strcmp(ql->bookmarks[i].name, name)) {
+ return &ql->bookmarks[i];
+ }
+ }
+ return NULL;
+}
+
+quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node) {
+ unsigned i;
+ for (i=0; i<ql->bookmark_count; i++) {
+ if (ql->bookmarks[i].node == node) {
+ return &ql->bookmarks[i];
+ }
+ }
+ return NULL;
+}
+
+void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm) {
+ int index = bm - ql->bookmarks;
+ zfree(bm->name);
+ ql->bookmark_count--;
+ memmove(bm, bm+1, (ql->bookmark_count - index)* sizeof(*bm));
+ /* NOTE: We do not shrink (realloc) the quicklist yet (to avoid resonance,
+ * it may be re-used later (a call to realloc may NOP). */
+}
+
+void quicklistBookmarksClear(quicklist *ql) {
+ while (ql->bookmark_count)
+ zfree(ql->bookmarks[--ql->bookmark_count].name);
+ /* NOTE: We do not shrink (realloc) the quick list. main use case for this
+ * function is just before releasing the allocation. */
+}
+
/* The rest of this file is test cases and test helpers. */
#ifdef REDIS_TEST
#include <stdint.h>
@@ -2641,6 +2739,54 @@ int quicklistTest(int argc, char *argv[]) {
printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000);
printf("\n");
+ TEST("bookmark get updated to next item") {
+ quicklist *ql = quicklistNew(1, 0);
+ quicklistPushTail(ql, "1", 1);
+ quicklistPushTail(ql, "2", 1);
+ quicklistPushTail(ql, "3", 1);
+ quicklistPushTail(ql, "4", 1);
+ quicklistPushTail(ql, "5", 1);
+ assert(ql->len==5);
+ /* add two bookmarks, one pointing to the node before the last. */
+ assert(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next));
+ assert(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev));
+ /* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */
+ assert(quicklistBookmarkFind(ql, "_test") == ql->tail->prev);
+ assert(quicklistDelRange(ql, -2, 1));
+ assert(quicklistBookmarkFind(ql, "_test") == ql->tail);
+ /* delete the last node, and see that the bookmark was deleted. */
+ assert(quicklistDelRange(ql, -1, 1));
+ assert(quicklistBookmarkFind(ql, "_test") == NULL);
+ /* test that other bookmarks aren't affected */
+ assert(quicklistBookmarkFind(ql, "_dummy") == ql->head->next);
+ assert(quicklistBookmarkFind(ql, "_missing") == NULL);
+ assert(ql->len==3);
+ quicklistBookmarksClear(ql); /* for coverage */
+ assert(quicklistBookmarkFind(ql, "_dummy") == NULL);
+ quicklistRelease(ql);
+ }
+
+ TEST("bookmark limit") {
+ int i;
+ quicklist *ql = quicklistNew(1, 0);
+ quicklistPushHead(ql, "1", 1);
+ for (i=0; i<QL_MAX_BM; i++)
+ assert(quicklistBookmarkCreate(&ql, genstr("",i), ql->head));
+ /* when all bookmarks are used, creation fails */
+ assert(!quicklistBookmarkCreate(&ql, "_test", ql->head));
+ /* delete one and see that we can now create another */
+ assert(quicklistBookmarkDelete(ql, "0"));
+ assert(quicklistBookmarkCreate(&ql, "_test", ql->head));
+ /* delete one and see that the rest survive */
+ assert(quicklistBookmarkDelete(ql, "_test"));
+ for (i=1; i<QL_MAX_BM; i++)
+ assert(quicklistBookmarkFind(ql, genstr("",i)) == ql->head);
+ /* make sure the deleted ones are indeed gone */
+ assert(!quicklistBookmarkFind(ql, "0"));
+ assert(!quicklistBookmarkFind(ql, "_test"));
+ quicklistRelease(ql);
+ }
+
if (!err)
printf("ALL TESTS PASSED!\n");
else
diff --git a/src/quicklist.h b/src/quicklist.h
index a7e27a2dd..8b553c119 100644
--- a/src/quicklist.h
+++ b/src/quicklist.h
@@ -28,6 +28,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
+#include <stdint.h> // for UINTPTR_MAX
+
#ifndef __QUICKLIST_H__
#define __QUICKLIST_H__
@@ -64,19 +66,51 @@ typedef struct quicklistLZF {
char compressed[];
} quicklistLZF;
+/* Bookmarks are padded with realloc at the end of of the quicklist struct.
+ * They should only be used for very big lists if thousands of nodes were the
+ * excess memory usage is negligible, and there's a real need to iterate on them
+ * in portions.
+ * When not used, they don't add any memory overhead, but when used and then
+ * deleted, some overhead remains (to avoid resonance).
+ * The number of bookmarks used should be kept to minimum since it also adds
+ * overhead on node deletion (searching for a bookmark to update). */
+typedef struct quicklistBookmark {
+ quicklistNode *node;
+ char *name;
+} quicklistBookmark;
+
+#if UINTPTR_MAX == 0xffffffff
+/* 32-bit */
+# define QL_FILL_BITS 14
+# define QL_COMP_BITS 14
+# define QL_BM_BITS 4
+#elif UINTPTR_MAX == 0xffffffffffffffff
+/* 64-bit */
+# define QL_FILL_BITS 16
+# define QL_COMP_BITS 16
+# define QL_BM_BITS 4 /* we can encode more, but we rather limit the user
+ since they cause performance degradation. */
+#else
+# error unknown arch bits count
+#endif
+
/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist.
* 'count' is the number of total entries.
* 'len' is the number of quicklist nodes.
* 'compress' is: -1 if compression disabled, otherwise it's the number
* of quicklistNodes to leave uncompressed at ends of quicklist.
- * 'fill' is the user-requested (or default) fill factor. */
+ * 'fill' is the user-requested (or default) fill factor.
+ * 'bookmakrs are an optional feature that is used by realloc this struct,
+ * so that they don't consume memory when not used. */
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all ziplists */
unsigned long len; /* number of quicklistNodes */
- int fill : 16; /* fill factor for individual nodes */
- unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
+ int fill : QL_FILL_BITS; /* fill factor for individual nodes */
+ unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
+ unsigned int bookmark_count: QL_BM_BITS;
+ quicklistBookmark bookmarks[];
} quicklist;
typedef struct quicklistIter {
@@ -158,6 +192,12 @@ unsigned long quicklistCount(const quicklist *ql);
int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len);
size_t quicklistGetLzf(const quicklistNode *node, void **data);
+/* bookmarks */
+int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node);
+int quicklistBookmarkDelete(quicklist *ql, const char *name);
+quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name);
+void quicklistBookmarksClear(quicklist *ql);
+
#ifdef REDIS_TEST
int quicklistTest(int argc, char *argv[]);
#endif
diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index d152e212c..ec80c7384 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -209,5 +209,97 @@ start_server {tags {"defrag"}} {
assert {$digest eq $newdigest}
r save ;# saving an rdb iterates over all the data / pointers
} {OK}
+
+ test "Active defrag big list" {
+ r flushdb
+ r config resetstat
+ r config set save "" ;# prevent bgsave from interfereing with save below
+ r config set hz 100
+ r config set activedefrag no
+ r config set active-defrag-max-scan-fields 1000
+ r config set active-defrag-threshold-lower 5
+ r config set active-defrag-cycle-min 65
+ r config set active-defrag-cycle-max 75
+ r config set active-defrag-ignore-bytes 2mb
+ r config set maxmemory 0
+ r config set list-max-ziplist-size 5 ;# list of 500k items will have 100k quicklist nodes
+
+ # create big keys with 10k items
+ set rd [redis_deferring_client]
+
+ set expected_frag 1.7
+ # add a mass of list nodes to two lists (allocations are interlaced)
+ set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
+ for {set j 0} {$j < 500000} {incr j} {
+ $rd lpush biglist1 $val
+ $rd lpush biglist2 $val
+ }
+ for {set j 0} {$j < 500000} {incr j} {
+ $rd read ; # Discard replies
+ $rd read ; # Discard replies
+ }
+
+ # create some fragmentation
+ r del biglist2
+
+ # start defrag
+ after 120 ;# serverCron only updates the info once in 100ms
+ set frag [s allocator_frag_ratio]
+ if {$::verbose} {
+ puts "frag $frag"
+ }
+
+ assert {$frag >= $expected_frag}
+ r config set latency-monitor-threshold 5
+ r latency reset
+
+ set digest [r debug digest]
+ catch {r config set activedefrag yes} e
+ if {![string match {DISABLED*} $e]} {
+ # wait for the active defrag to start working (decision once a second)
+ wait_for_condition 50 100 {
+ [s active_defrag_running] ne 0
+ } else {
+ fail "defrag not started."
+ }
+
+ # wait for the active defrag to stop working
+ wait_for_condition 500 100 {
+ [s active_defrag_running] eq 0
+ } else {
+ after 120 ;# serverCron only updates the info once in 100ms
+ puts [r info memory]
+ puts [r info stats]
+ puts [r memory malloc-stats]
+ fail "defrag didn't stop."
+ }
+
+ # test the the fragmentation is lower
+ after 120 ;# serverCron only updates the info once in 100ms
+ set frag [s allocator_frag_ratio]
+ set max_latency 0
+ foreach event [r latency latest] {
+ lassign $event eventname time latency max
+ if {$eventname == "active-defrag-cycle"} {
+ set max_latency $max
+ }
+ }
+ if {$::verbose} {
+ puts "frag $frag"
+ puts "max latency $max_latency"
+ puts [r latency latest]
+ puts [r latency history active-defrag-cycle]
+ }
+ assert {$frag < 1.1}
+ # due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
+ # we expect max latency to be not much higher than 7.5ms
+ assert {$max_latency <= 12}
+ }
+ # verify the data isn't corrupted or changed
+ set newdigest [r debug digest]
+ assert {$digest eq $newdigest}
+ r save ;# saving an rdb iterates over all the data / pointers
+ r del biglist1 ;# coverage for quicklistBookmarksClear
+ } {1}
}
}