author     Viktor Söderqvist <viktor.soderqvist@est.tech>    2022-11-20 23:23:54 +0100
committer  Viktor Söderqvist <viktor.soderqvist@est.tech>    2023-01-11 10:25:20 +0100
commit     b60d33c91eef09aea34cecad789c552405737c55 (patch)
tree       d106927d7b6ef9d64d530a9722c6affa3dbe7e83
parent     d4e9e0aebdc2c44c252e8ca27644b4392a6e820b (diff)
download   redis-b60d33c91eef09aea34cecad789c552405737c55.tar.gz
Remove the bucket-cb from dictScan and move dictEntry defrag to dictScanDefrag
This change deletes the dictGetNext and dictGetNextRef functions, so the dict API doesn't expose the next field at all.

The bucket callback in dictScan is deleted. A separate dictScanDefrag function is added which takes a defrag alloc function used to defrag-reallocate the dict entries. "Dirty" code accessing the dict internals in active defrag is removed.

An 'afterReplaceEntry' callback is added to dictType, which allows the dict user to keep the dictEntry metadata up to date after an entry is reallocated/defragged/moved.

Additionally, to update the cluster slot-to-key mapping after a dictEntry has been reallocated, we need to know which db a dict belongs to, so we store a pointer to the db in a new metadata section in the dict struct, a new mechanism similar to dictEntry metadata. This adds some complexity but provides better isolation.
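For illustration only (not part of the commit), a minimal sketch of how a caller migrates from the old bucket-callback form of dictScan to the new dictScanDefrag; myDefragAlloc and myScanCallback are hypothetical stand-ins for activeDefragAlloc and the caller's scan callback:

#include <stddef.h>
#include "dict.h"   /* Redis dict API as modified by this commit */

/* Hypothetical defrag allocator: return the new address if the allocation was
 * moved, or NULL to signal that the old pointer is still valid. */
static void *myDefragAlloc(void *ptr) {
    (void)ptr;       /* in Redis this would be activeDefragAlloc() */
    return NULL;     /* nothing moved in this sketch */
}

/* Ordinary scan callback; entry defrag is now done per bucket inside
 * dictScanDefrag, before this callback sees the (possibly moved) entry. */
static void myScanCallback(void *privdata, const dictEntry *de) {
    (void)privdata;
    (void)de;
}

static void scanAndDefrag(dict *d) {
    unsigned long cursor = 0;
    do {
        /* Old form: dictScan(d, cursor, myScanCallback, myBucketCallback, NULL);
         * New form: no bucket callback, pass a defrag alloc function instead. */
        cursor = dictScanDefrag(d, cursor, myScanCallback, myDefragAlloc, NULL);
    } while (cursor != 0);
}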
-rw-r--r--   src/cluster.c |  10
-rw-r--r--   src/cluster.h |   5
-rw-r--r--   src/db.c      |   2
-rw-r--r--   src/defrag.c  | 105
-rw-r--r--   src/dict.c    |  89
-rw-r--r--   src/dict.h    |  30
-rw-r--r--   src/expire.c  |   2
-rw-r--r--   src/module.c  |   4
-rw-r--r--   src/object.c  |   3
-rw-r--r--   src/server.c  |  17
10 files changed, 158 insertions, 109 deletions
diff --git a/src/cluster.c b/src/cluster.c
index ecebc511b..82f3e36a8 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -93,9 +93,9 @@ static void clusterBuildMessageHdr(clusterMsg *hdr, int type, size_t msglen);
/* Links to the next and previous entries for keys in the same slot are stored
* in the dict entry metadata. See Slot to Key API below. */
#define dictEntryNextInSlot(de) \
- (((clusterDictEntryMetadata *)dictMetadata(de))->next)
+ (((clusterDictEntryMetadata *)dictEntryMetadata(de))->next)
#define dictEntryPrevInSlot(de) \
- (((clusterDictEntryMetadata *)dictMetadata(de))->prev)
+ (((clusterDictEntryMetadata *)dictEntryMetadata(de))->prev)
#define RCVBUF_INIT_LEN 1024
#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */
@@ -7327,7 +7327,7 @@ void slotToKeyDelEntry(dictEntry *entry, redisDb *db) {
/* Updates neighbour entries when an entry has been replaced (e.g. reallocated
* during active defrag). */
-void slotToKeyReplaceEntry(dictEntry *entry, redisDb *db) {
+void slotToKeyReplaceEntry(dict *d, dictEntry *entry) {
dictEntry *next = dictEntryNextInSlot(entry);
dictEntry *prev = dictEntryPrevInSlot(entry);
if (next != NULL) {
@@ -7339,6 +7339,8 @@ void slotToKeyReplaceEntry(dictEntry *entry, redisDb *db) {
/* The replaced entry was the first in the list. */
sds key = dictGetKey(entry);
unsigned int hashslot = keyHashSlot(key, sdslen(key));
+ clusterDictMetadata *dictmeta = dictMetadata(d);
+ redisDb *db = dictmeta->db;
slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot];
slot_to_keys->head = entry;
}
@@ -7347,6 +7349,8 @@ void slotToKeyReplaceEntry(dictEntry *entry, redisDb *db) {
/* Initialize slots-keys map of given db. */
void slotToKeyInit(redisDb *db) {
db->slots_to_keys = zcalloc(sizeof(clusterSlotToKeyMapping));
+ clusterDictMetadata *dictmeta = dictMetadata(db->dict);
+ dictmeta->db = db;
}
/* Empty slots-keys map of given db. */
diff --git a/src/cluster.h b/src/cluster.h
index 4c93dbc8d..b76a9a7cd 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -169,6 +169,9 @@ typedef struct clusterDictEntryMetadata {
dictEntry *prev; /* Prev entry with key in the same slot */
dictEntry *next; /* Next entry with key in the same slot */
} clusterDictEntryMetadata;
+typedef struct {
+ redisDb *db; /* A link back to the db this dict belongs to */
+} clusterDictMetadata;
typedef struct clusterState {
clusterNode *myself; /* This node */
@@ -409,7 +412,7 @@ void clusterPropagatePublish(robj *channel, robj *message, int sharded);
unsigned int keyHashSlot(char *key, int keylen);
void slotToKeyAddEntry(dictEntry *entry, redisDb *db);
void slotToKeyDelEntry(dictEntry *entry, redisDb *db);
-void slotToKeyReplaceEntry(dictEntry *entry, redisDb *db);
+void slotToKeyReplaceEntry(dict *d, dictEntry *entry);
void slotToKeyInit(redisDb *db);
void slotToKeyFlush(redisDb *db);
void slotToKeyDestroy(redisDb *db);
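As a hedged illustration of why the per-dict metadata is needed: after dictScanDefrag reallocates an entry, the dict.c changes below invoke the dict's afterReplaceEntry callback, which in cluster mode is slotToKeyReplaceEntry; that function can now reach the owning db through the dict metadata instead of being passed a redisDb pointer. A minimal sketch (dbFromDict is a hypothetical helper, not in the patch):

#include "server.h"    /* redisDb, includes dict.h */
#include "cluster.h"   /* clusterDictMetadata, as added by this commit */

/* Sketch: resolve the owning db from a keyspace dict. The pointer is stored
 * once, at slotToKeyInit() time, in the dict's zero-initialized metadata. */
static redisDb *dbFromDict(dict *d) {
    clusterDictMetadata *meta = dictMetadata(d);
    return meta->db;
}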
diff --git a/src/db.c b/src/db.c
index abe717ace..8511135c6 100644
--- a/src/db.c
+++ b/src/db.c
@@ -939,7 +939,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long cursor) {
privdata[0] = keys;
privdata[1] = o;
do {
- cursor = dictScan(ht, cursor, scanCallback, NULL, privdata);
+ cursor = dictScan(ht, cursor, scanCallback, privdata);
} while (cursor &&
maxiterations-- &&
listLength(keys) < (unsigned long)count);
diff --git a/src/defrag.c b/src/defrag.c
index 8839e1107..0d6c6b45a 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -45,10 +45,6 @@
* pointers are worthwhile moving and which aren't */
int je_get_defrag_hint(void* ptr);
-/* forward declarations*/
-void defragDictBucketCallback(dict *d, dictEntry **bucketref);
-dictEntry* replaceSatelliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sds newkey, uint64_t hash, long *defragged);
-
/* Defrag helper for generic allocations.
*
* returns NULL in case the allocation wasn't moved.
@@ -289,8 +285,8 @@ long activeDefragSdsDict(dict* d, int val_type) {
activeDefragSdsDictData data = {d, val_type, 0};
unsigned long cursor = 0;
do {
- cursor = dictScan(d, cursor, activeDefragSdsDictCallback,
- defragDictBucketCallback, &data);
+ cursor = dictScanDefrag(d, cursor, activeDefragSdsDictCallback,
+ activeDefragAlloc, &data);
} while (cursor != 0);
return data.defragged;
}
@@ -329,29 +325,6 @@ long activeDefragList(list *l, int val_type) {
return defragged;
}
-/* Utility function that replaces an old key pointer in the dictionary with a
- * new pointer. Additionally, we try to defrag the dictEntry in that dict.
- * Oldkey may be a dead pointer and should not be accessed (we get a
- * pre-calculated hash value). Newkey may be null if the key pointer wasn't
- * moved. Return value is the dictEntry if found, or NULL if not found.
- * NOTE: this is very ugly code, but it let's us avoid the complication of
- * doing a scan on another dict. */
-dictEntry* replaceSatelliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sds newkey, uint64_t hash, long *defragged) {
- dictEntry **deref = dictFindEntryRefByPtrAndHash(d, oldkey, hash);
- if (deref) {
- dictEntry *de = *deref;
- dictEntry *newde = activeDefragAlloc(de);
- if (newde) {
- de = *deref = newde;
- (*defragged)++;
- }
- if (newkey)
- dictSetKey(d, de, newkey);
- return de;
- }
- return NULL;
-}
-
long activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
quicklistNode *newnode, *node = *node_ref;
long defragged = 0;
@@ -453,7 +426,7 @@ long scanLaterZset(robj *ob, unsigned long *cursor) {
zset *zs = (zset*)ob->ptr;
dict *d = zs->dict;
scanLaterZsetData data = {zs, 0};
- *cursor = dictScan(d, *cursor, scanLaterZsetCallback, defragDictBucketCallback, &data);
+ *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, activeDefragAlloc, &data);
return data.defragged;
}
@@ -476,7 +449,7 @@ long scanLaterSet(robj *ob, unsigned long *cursor) {
return 0;
dict *d = ob->ptr;
scanLaterDictData data = {d, 0};
- *cursor = dictScan(d, *cursor, scanLaterSetCallback, defragDictBucketCallback, &data);
+ *cursor = dictScanDefrag(d, *cursor, scanLaterSetCallback, activeDefragAlloc, &data);
return data.defragged;
}
@@ -497,7 +470,7 @@ long scanLaterHash(robj *ob, unsigned long *cursor) {
return 0;
dict *d = ob->ptr;
scanLaterDictData data = {d, 0};
- *cursor = dictScan(d, *cursor, scanLaterHashCallback, defragDictBucketCallback, &data);
+ *cursor = dictScanDefrag(d, *cursor, scanLaterHashCallback, activeDefragAlloc, &data);
return data.defragged;
}
@@ -777,14 +750,17 @@ long defragKey(redisDb *db, dictEntry *de) {
/* Try to defrag the key name. */
newsds = activeDefragSds(keysds);
- if (newsds)
- defragged++, dictSetKey(db->dict, de, newsds);
- if (dictSize(db->expires)) {
- /* Dirty code:
- * I can't search in db->expires for that key after i already released
- * the pointer it holds it won't be able to do the string compare */
- uint64_t hash = dictGetHash(db->dict, dictGetKey(de));
- replaceSatelliteDictKeyPtrAndOrDefragDictEntry(db->expires, keysds, newsds, hash, &defragged);
+ if (newsds) {
+ defragged++;
+ dictSetKey(db->dict, de, newsds);
+ if (dictSize(db->expires)) {
+ /* We can't search in db->expires for that key after we've released
+ * the pointer it holds, since it won't be able to do the string
+ * compare, but we can find the entry using key hash and pointer. */
+ uint64_t hash = dictGetHash(db->dict, newsds);
+ dictEntry *expire_de = dictFindEntryByPtrAndHash(db->expires, keysds, hash);
+ if (expire_de) dictSetKey(db->expires, expire_de, newsds);
+ }
}
/* Try to defrag robj and / or string value. */
@@ -856,20 +832,12 @@ void defragScanCallback(void *privdata, const dictEntry *de) {
server.stat_active_defrag_scanned++;
}
-/* Defrag scan callback for each hash table bucket,
- * used in order to defrag the dictEntry allocations. */
-void defragDictBucketCallback(dict *d, dictEntry **bucketref) {
- while (bucketref && *bucketref) {
- dictEntry *de = *bucketref, *newde;
- if ((newde = activeDefragAlloc(de))) {
- *bucketref = newde;
- if (server.cluster_enabled && d == server.db[0].dict) {
- /* Cluster keyspace dict. Update slot-to-entries mapping. */
- slotToKeyReplaceEntry(newde, server.db);
- }
- }
- bucketref = dictGetNextRef(*bucketref);
- }
+/* Dummy scan callback used when defragging the expire dictionary. We only
+ * defrag the entries, which is done per bucket. */
+void defragExpireScanCallback(void *privdata, const dictEntry *de) {
+ UNUSED(privdata);
+ UNUSED(de);
+ server.stat_active_defrag_scanned++;
}
/* Utility function to get the fragmentation ratio from jemalloc.
@@ -1037,6 +1005,7 @@ void computeDefragCycles() {
void activeDefragCycle(void) {
static int current_db = -1;
static unsigned long cursor = 0;
+ static unsigned long expires_cursor = 0;
static redisDb *db = NULL;
static long long start_scan, start_stat;
unsigned int iterations = 0;
@@ -1082,7 +1051,7 @@ void activeDefragCycle(void) {
do {
/* if we're not continuing a scan from the last call or loop, start a new one */
- if (!cursor) {
+ if (!cursor && !expires_cursor) {
/* finish any leftovers from previous db before moving to the next one */
if (db && defragLaterStep(db, endtime)) {
quit = 1; /* time is up, we didn't finish all the work */
@@ -1129,16 +1098,32 @@ void activeDefragCycle(void) {
break; /* this will exit the function and we'll continue on the next cycle */
}
- cursor = dictScan(db->dict, cursor, defragScanCallback, defragDictBucketCallback, db);
+ /* Scan the keyspace dict unless we're scanning the expire dict. */
+ if (!expires_cursor)
+ cursor = dictScanDefrag(db->dict,
+ cursor,
+ defragScanCallback,
+ activeDefragAlloc,
+ db);
+
+ /* When done scanning the keyspace dict, we scan the expire dict. */
+ if (!cursor)
+ expires_cursor = dictScanDefrag(db->expires,
+ expires_cursor,
+ defragExpireScanCallback,
+ activeDefragAlloc,
+ NULL);
/* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys
* (if we have a lot of pointers in one hash bucket or rehashing),
* check if we reached the time limit.
* But regardless, don't start a new db in this loop, this is because after
* the last db we call defragOtherGlobals, which must be done in one cycle */
- if (!cursor || (++iterations > 16 ||
- server.stat_active_defrag_hits - prev_defragged > 512 ||
- server.stat_active_defrag_scanned - prev_scanned > 64)) {
+ if (!(cursor || expires_cursor) ||
+ ++iterations > 16 ||
+ server.stat_active_defrag_hits - prev_defragged > 512 ||
+ server.stat_active_defrag_scanned - prev_scanned > 64)
+ {
if (!cursor || ustime() > endtime) {
quit = 1;
break;
@@ -1147,7 +1132,7 @@ void activeDefragCycle(void) {
prev_defragged = server.stat_active_defrag_hits;
prev_scanned = server.stat_active_defrag_scanned;
}
- } while(cursor && !quit);
+ } while((cursor || expires_cursor) && !quit);
} while(!quit);
latencyEndMonitor(latency);
diff --git a/src/dict.c b/src/dict.c
index 193f619b7..ec2d4dca0 100644
--- a/src/dict.c
+++ b/src/dict.c
@@ -120,7 +120,11 @@ static void _dictReset(dict *d, int htidx)
/* Create a new hash table */
dict *dictCreate(dictType *type)
{
- dict *d = zmalloc(sizeof(*d));
+ size_t metasize = type->dictMetadataBytes ? type->dictMetadataBytes() : 0;
+ dict *d = zmalloc(sizeof(*d) + metasize);
+ if (metasize) {
+ memset(dictMetadata(d), 0, metasize);
+ }
_dictInit(d,type);
return d;
@@ -319,6 +323,11 @@ static void _dictRehashStep(dict *d) {
if (d->pauserehash == 0) dictRehash(d,1);
}
+/* Return a pointer to the metadata section within the dict. */
+void *dictMetadata(dict *d) {
+ return &d->metadata;
+}
+
/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
@@ -365,10 +374,10 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
* system it is more likely that recently added entries are accessed
* more frequently. */
htidx = dictIsRehashing(d) ? 1 : 0;
- size_t metasize = dictMetadataSize(d);
+ size_t metasize = dictEntryMetadataSize(d);
entry = zmalloc(sizeof(*entry) + metasize);
if (metasize > 0) {
- memset(dictMetadata(entry), 0, metasize);
+ memset(dictEntryMetadata(entry), 0, metasize);
}
entry->next = d->ht_table[htidx][index];
d->ht_table[htidx][index] = entry;
@@ -648,7 +657,7 @@ double dictIncrDoubleVal(dictEntry *de, double val) {
}
/* A pointer to the metadata section within the dict entry. */
-void *dictMetadata(dictEntry *de) {
+void *dictEntryMetadata(dictEntry *de) {
return &de->metadata;
}
@@ -677,17 +686,6 @@ double *dictGetDoubleValPtr(dictEntry *de) {
return &de->v.d;
}
-/* The next entry in same hash bucket. */
-dictEntry *dictGetNext(const dictEntry *de) {
- return de->next;
-}
-
-/* A pointer to the 'next' field within the entry, or NULL if there is no next
- * field. */
-dictEntry **dictGetNextRef(dictEntry *de) {
- return &de->next;
-}
-
/* Returns the memory usage in bytes of the dict, excluding the size of the keys
* and values. */
size_t dictMemUsage(const dict *d) {
@@ -949,6 +947,21 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) {
return stored;
}
+
+/* Reallocate the dictEntry allocations in a bucket using the provided
+ * allocation function in order to defrag them. */
+static void dictDefragBucket(dict *d, dictEntry **bucketref, dictDefragAllocFunction *allocfn) {
+ while (bucketref && *bucketref) {
+ dictEntry *de = *bucketref, *newde;
+ if ((newde = allocfn(de))) {
+ *bucketref = newde;
+ if (d->type->afterReplaceEntry)
+ d->type->afterReplaceEntry(d, newde);
+ }
+ bucketref = &(*bucketref)->next;
+ }
+}
+
/* This is like dictGetRandomKey() from the POV of the API, but will do more
* work to ensure a better distribution of the returned element.
*
@@ -1072,9 +1085,25 @@ static unsigned long rev(unsigned long v) {
unsigned long dictScan(dict *d,
unsigned long v,
dictScanFunction *fn,
- dictScanBucketFunction* bucketfn,
void *privdata)
{
+ return dictScanDefrag(d, v, fn, NULL, privdata);
+}
+
+/* Like dictScan, but additionally reallocates the memory used by the dict
+ * entries using the provided allocation function. This feature was added for
+ * the active defrag feature.
+ *
+ * The 'defragallocfn' callback is called with a pointer to memory that the
+ * callback can reallocate. The callback should return a new memory address or NULL,
+ * where NULL means that no reallocation happened and the old memory is still
+ * valid. */
+unsigned long dictScanDefrag(dict *d,
+ unsigned long v,
+ dictScanFunction *fn,
+ dictDefragAllocFunction *defragallocfn,
+ void *privdata)
+{
int htidx0, htidx1;
const dictEntry *de, *next;
unsigned long m0, m1;
@@ -1089,7 +1118,9 @@ unsigned long dictScan(dict *d,
m0 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx0]);
/* Emit entries at cursor */
- if (bucketfn) bucketfn(d, &d->ht_table[htidx0][v & m0]);
+ if (defragallocfn) {
+ dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragallocfn);
+ }
de = d->ht_table[htidx0][v & m0];
while (de) {
next = de->next;
@@ -1120,7 +1151,9 @@ unsigned long dictScan(dict *d,
m1 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx1]);
/* Emit entries at cursor */
- if (bucketfn) bucketfn(d, &d->ht_table[htidx0][v & m0]);
+ if (defragallocfn) {
+ dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragallocfn);
+ }
de = d->ht_table[htidx0][v & m0];
while (de) {
next = de->next;
@@ -1132,7 +1165,9 @@ unsigned long dictScan(dict *d,
* of the index pointed to by the cursor in the smaller table */
do {
/* Emit entries at cursor */
- if (bucketfn) bucketfn(d, &d->ht_table[htidx1][v & m1]);
+ if (defragallocfn) {
+ dictDefragBucket(d, &d->ht_table[htidx1][v & m1], defragallocfn);
+ }
de = d->ht_table[htidx1][v & m1];
while (de) {
next = de->next;
@@ -1253,25 +1288,23 @@ uint64_t dictGetHash(dict *d, const void *key) {
return dictHashKey(d, key);
}
-/* Finds the dictEntry reference by using pointer and pre-calculated hash.
+/* Finds the dictEntry using pointer and pre-calculated hash.
* oldkey is a dead pointer and should not be accessed.
* the hash value should be provided using dictGetHash.
* no string / key comparison is performed.
- * return value is the reference to the dictEntry if found, or NULL if not found. */
-dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) {
- dictEntry *he, **heref;
+ * return value is a pointer to the dictEntry if found, or NULL if not found. */
+dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) {
+ dictEntry *he;
unsigned long idx, table;
if (dictSize(d) == 0) return NULL; /* dict is empty */
for (table = 0; table <= 1; table++) {
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
- heref = &d->ht_table[table][idx];
- he = *heref;
+ he = d->ht_table[table][idx];
while(he) {
if (oldptr==he->key)
- return heref;
- heref = &he->next;
- he = *heref;
+ return he;
+ he = he->next;
}
if (!dictIsRehashing(d)) return NULL;
}
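A rough usage sketch of the reworked lookup (updateSatelliteKey is a hypothetical helper, roughly what defragKey now does for db->expires): after a key sds has been moved, the matching entry in a satellite dict is found by the old pointer plus a pre-computed hash, and its key is updated through the public API:

#include <stdint.h>
#include "dict.h"

/* Sketch: the key of the main dict was reallocated from 'oldkey' to 'newkey'.
 * 'hash' must have been computed with dictGetHash() before 'oldkey' was freed,
 * since the dead pointer is only compared by address, never dereferenced. */
static void updateSatelliteKey(dict *satellite, const void *oldkey,
                               void *newkey, uint64_t hash) {
    dictEntry *de = dictFindEntryByPtrAndHash(satellite, oldkey, hash);
    if (de) dictSetKey(satellite, de, newkey);
}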
diff --git a/src/dict.h b/src/dict.h
index 99ff3b769..a8a76e69c 100644
--- a/src/dict.h
+++ b/src/dict.h
@@ -56,9 +56,13 @@ typedef struct dictType {
void (*keyDestructor)(dict *d, void *key);
void (*valDestructor)(dict *d, void *obj);
int (*expandAllowed)(size_t moreMem, double usedRatio);
- /* Allow a dictEntry to carry extra caller-defined metadata. The
- * extra memory is initialized to 0 when a dictEntry is allocated. */
+ /* Allow each dict and dictEntry to carry extra caller-defined metadata. The
+ * extra memory is initialized to 0 when allocated. */
size_t (*dictEntryMetadataBytes)(dict *d);
+ size_t (*dictMetadataBytes)(void);
+ /* Optional callback called after an entry has been reallocated (due to
+ * active defrag). Only called if the entry has metadata. */
+ void (*afterReplaceEntry)(dict *d, dictEntry *entry);
} dictType;
#define DICTHT_SIZE(exp) ((exp) == -1 ? 0 : (unsigned long)1<<(exp))
@@ -75,6 +79,10 @@ struct dict {
/* Keep small vars at end for optimal (minimal) struct padding */
int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */
+
+ void *metadata[]; /* An arbitrary number of bytes (starting at a
+ * pointer-aligned address) of size as defined
* by dictType's dictMetadataBytes. */
};
/* If safe is set to 1 this is a safe iterator, that means, you can call
@@ -91,7 +99,7 @@ typedef struct dictIterator {
} dictIterator;
typedef void (dictScanFunction)(void *privdata, const dictEntry *de);
-typedef void (dictScanBucketFunction)(dict *d, dictEntry **bucketref);
+typedef void *(dictDefragAllocFunction)(void *ptr);
/* This is the initial size of every hash table */
#define DICT_HT_INITIAL_EXP 2
@@ -112,8 +120,10 @@ typedef void (dictScanBucketFunction)(dict *d, dictEntry **bucketref);
(d)->type->keyCompare((d), key1, key2) : \
(key1) == (key2))
-#define dictMetadataSize(d) ((d)->type->dictEntryMetadataBytes \
- ? (d)->type->dictEntryMetadataBytes(d) : 0)
+#define dictEntryMetadataSize(d) ((d)->type->dictEntryMetadataBytes \
+ ? (d)->type->dictEntryMetadataBytes(d) : 0)
+#define dictMetadataSize(d) ((d)->type->dictMetadataBytes \
+ ? (d)->type->dictMetadataBytes() : 0)
#define dictHashKey(d, key) ((d)->type->hashFunction(key))
#define dictSlots(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1]))
@@ -139,6 +149,7 @@ typedef enum {
dict *dictCreate(dictType *type);
int dictExpand(dict *d, unsigned long size);
int dictTryExpand(dict *d, unsigned long size);
+void *dictMetadata(dict *d);
int dictAdd(dict *d, void *key, void *val);
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing);
dictEntry *dictAddOrFind(dict *d, void *key);
@@ -160,15 +171,13 @@ void dictSetDoubleVal(dictEntry *de, double val);
int64_t dictIncrSignedIntegerVal(dictEntry *de, int64_t val);
uint64_t dictIncrUnsignedIntegerVal(dictEntry *de, uint64_t val);
double dictIncrDoubleVal(dictEntry *de, double val);
-void *dictMetadata(dictEntry *de);
+void *dictEntryMetadata(dictEntry *de);
void *dictGetKey(const dictEntry *de);
void *dictGetVal(const dictEntry *de);
int64_t dictGetSignedIntegerVal(const dictEntry *de);
uint64_t dictGetUnsignedIntegerVal(const dictEntry *de);
double dictGetDoubleVal(const dictEntry *de);
double *dictGetDoubleValPtr(dictEntry *de);
-dictEntry *dictGetNext(const dictEntry *de);
-dictEntry **dictGetNextRef(dictEntry *de);
size_t dictMemUsage(const dict *d);
size_t dictEntryMemUsage(void);
dictIterator *dictGetIterator(dict *d);
@@ -190,9 +199,10 @@ int dictRehash(dict *d, int n);
int dictRehashMilliseconds(dict *d, int ms);
void dictSetHashFunctionSeed(uint8_t *seed);
uint8_t *dictGetHashFunctionSeed(void);
-unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, dictScanBucketFunction *bucketfn, void *privdata);
+unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata);
+unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragAllocFunction *allocfn, void *privdata);
uint64_t dictGetHash(dict *d, const void *key);
-dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);
+dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash);
#ifdef REDIS_TEST
int dictTest(int argc, char *argv[], int flags);
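To show how a dict user opts in to the new hooks, a sketch of a dictType using the two fields added above (the callbacks and metadata layout are illustrative, not from the patch; compare dbDictType in the server.c hunk below):

#include <stdint.h>
#include <string.h>
#include "dict.h"

typedef struct { void *owner; } myDictMetadata;   /* illustrative layout */

static uint64_t myHash(const void *key) {
    return dictGenHashFunction(key, strlen(key));
}

/* Size of the zero-initialized metadata block appended to the dict struct. */
static size_t myDictMetadataBytes(void) {
    return sizeof(myDictMetadata);
}

/* Called by dictScanDefrag (via dictDefragBucket) after an entry has been
 * reallocated; fix up any external pointers to the entry here. */
static void myAfterReplaceEntry(dict *d, dictEntry *entry) {
    (void)d;
    (void)entry;
}

dictType myDictType = {
    .hashFunction = myHash,
    .dictMetadataBytes = myDictMetadataBytes,    /* new in this commit */
    .afterReplaceEntry = myAfterReplaceEntry,    /* new in this commit */
};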
diff --git a/src/expire.c b/src/expire.c
index e357b9986..d07e22f66 100644
--- a/src/expire.c
+++ b/src/expire.c
@@ -271,7 +271,7 @@ void activeExpireCycle(int type) {
while (data.sampled < num && checked_buckets < max_buckets) {
db->expires_cursor = dictScan(db->expires, db->expires_cursor,
- expireScanCallback, NULL, &data);
+ expireScanCallback, &data);
checked_buckets++;
}
total_expired += data.expired;
diff --git a/src/module.c b/src/module.c
index cc6934dca..b745e8ee4 100644
--- a/src/module.c
+++ b/src/module.c
@@ -10336,7 +10336,7 @@ int RM_Scan(RedisModuleCtx *ctx, RedisModuleScanCursor *cursor, RedisModuleScanC
}
int ret = 1;
ScanCBData data = { ctx, privdata, fn };
- cursor->cursor = dictScan(ctx->client->db->dict, cursor->cursor, moduleScanCallback, NULL, &data);
+ cursor->cursor = dictScan(ctx->client->db->dict, cursor->cursor, moduleScanCallback, &data);
if (cursor->cursor == 0) {
cursor->done = 1;
ret = 0;
@@ -10448,7 +10448,7 @@ int RM_ScanKey(RedisModuleKey *key, RedisModuleScanCursor *cursor, RedisModuleSc
int ret = 1;
if (ht) {
ScanKeyCBData data = { key, privdata, fn };
- cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, NULL, &data);
+ cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data);
if (cursor->cursor == 0) {
cursor->done = 1;
ret = 0;
diff --git a/src/object.c b/src/object.c
index ca9a24088..67e372032 100644
--- a/src/object.c
+++ b/src/object.c
@@ -1252,7 +1252,8 @@ struct redisMemOverhead *getMemoryOverheadData(void) {
mem_total+=mem;
/* Account for the slot to keys map in cluster mode */
- mem = dictSize(db->dict) * dictMetadataSize(db->dict);
+ mem = dictSize(db->dict) * dictEntryMetadataSize(db->dict) +
+ dictMetadataSize(db->dict);
mh->db[mh->num_dbs].overhead_ht_slot_to_keys = mem;
mem_total+=mem;
diff --git a/src/server.c b/src/server.c
index ae83ec25c..910f299f3 100644
--- a/src/server.c
+++ b/src/server.c
@@ -399,13 +399,24 @@ int dictExpandAllowed(size_t moreMem, double usedRatio) {
/* Returns the size of the DB dict entry metadata in bytes. In cluster mode, the
* metadata is used for constructing a doubly linked list of the dict entries
* belonging to the same cluster slot. See the Slot to Key API in cluster.c. */
-size_t dictEntryMetadataSize(dict *d) {
+size_t dbDictEntryMetadataSize(dict *d) {
UNUSED(d);
/* NOTICE: this also affects overhead_ht_slot_to_keys in getMemoryOverheadData.
* If we ever add non-cluster related data here, that code must be modified too. */
return server.cluster_enabled ? sizeof(clusterDictEntryMetadata) : 0;
}
+/* Returns the size of the DB dict metadata in bytes. In cluster mode, we store
+ * a pointer to the db in the main db dict, used for updating the slot-to-key
+ * mapping when a dictEntry is reallocated. */
+size_t dbDictMetadataSize(void) {
+ return server.cluster_enabled ? sizeof(clusterDictMetadata) : 0;
+}
+
+void dbDictAfterReplaceEntry(dict *d, dictEntry *de) {
+ if (server.cluster_enabled) slotToKeyReplaceEntry(d, de);
+}
+
/* Generic hash table type where keys are Redis Objects, Values
* dummy pointers. */
dictType objectKeyPointerValueDictType = {
@@ -460,7 +471,9 @@ dictType dbDictType = {
dictSdsDestructor, /* key destructor */
dictObjectDestructor, /* val destructor */
dictExpandAllowed, /* allow to expand */
- dictEntryMetadataSize /* size of entry metadata in bytes */
+ dbDictEntryMetadataSize, /* size of entry metadata in bytes */
+ dbDictMetadataSize, /* size of dict metadata in bytes */
+ dbDictAfterReplaceEntry /* notify entry moved/reallocated */
};
/* Db->expires */