summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorOran Agra <oran@redislabs.com>2018-02-21 20:18:34 +0200
committerOran Agra <oran@redislabs.com>2018-07-16 16:43:42 +0300
commitbf680b6f8cdaee2c5588c5c8932a7f3b7fa70b15 (patch)
treebc0c6ef60ab8748cf1585406c4d27fb4e3083187 /src
parentab33bcd34640306cdf70fd4fda0af41d93c687bf (diff)
downloadredis-bf680b6f8cdaee2c5588c5c8932a7f3b7fa70b15.tar.gz
slave buffers were wasteful and incorrectly counted causing eviction
A) slave buffers didn't count internal fragmentation and sds unused space, this caused them to induce eviction although we didn't mean for it. B) slave buffers were consuming about twice the memory of what they actually needed. - this was mainly due to sdsMakeRoomFor growing to twice as much as needed each time but networking.c not storing more than 16k (partially fixed recently in 237a38737). - besides it wasn't able to store half of the new string into one buffer and the other half into the next (so the above mentioned fix helped mainly for small items). - lastly, the sds buffers had up to 30% internal fragmentation that was wasted, consumed but not used. C) inefficient performance due to starting from a small string and reallocing many times. what i changed: - creating dedicated buffers for reply list, counting their size with zmalloc_size - when creating a new reply node from, preallocate it to at least 16k. - when appending a new reply to the buffer, first fill all the unused space of the previous node before starting a new one. other changes: - expose mem_not_counted_for_evict info field for the benefit of the test suite - add a test to make sure slave buffers are counted correctly and that they don't cause eviction
Diffstat (limited to 'src')
-rw-r--r--src/aof.c2
-rw-r--r--src/module.c4
-rw-r--r--src/networking.c106
-rw-r--r--src/replication.c1
-rw-r--r--src/scripting.c4
-rw-r--r--src/server.c10
-rw-r--r--src/server.h7
-rw-r--r--src/zmalloc.c3
-rw-r--r--src/zmalloc.h3
9 files changed, 90 insertions, 50 deletions
diff --git a/src/aof.c b/src/aof.c
index 2d5f259fb..be416ec4e 100644
--- a/src/aof.c
+++ b/src/aof.c
@@ -645,7 +645,7 @@ struct client *createFakeClient(void) {
c->obuf_soft_limit_reached_time = 0;
c->watched_keys = listCreate();
c->peerid = NULL;
- listSetFreeMethod(c->reply,decrRefCountVoid);
+ listSetFreeMethod(c->reply,freeClientReplyValue);
listSetDupMethod(c->reply,dupClientReplyValue);
initClientMultiState(c);
return c;
diff --git a/src/module.c b/src/module.c
index a46b86a50..9809cd74e 100644
--- a/src/module.c
+++ b/src/module.c
@@ -2712,9 +2712,9 @@ RedisModuleCallReply *RM_Call(RedisModuleCtx *ctx, const char *cmdname, const ch
sds proto = sdsnewlen(c->buf,c->bufpos);
c->bufpos = 0;
while(listLength(c->reply)) {
- sds o = listNodeValue(listFirst(c->reply));
+ clientReplyBlock *o = listNodeValue(listFirst(c->reply));
- proto = sdscatsds(proto,o);
+ proto = sdscatlen(proto,o->buf,o->used);
listDelNode(c->reply,listFirst(c->reply));
}
reply = moduleCreateCallReplyFromProto(ctx,proto);
diff --git a/src/networking.c b/src/networking.c
index 58b553fe6..36ae26e0b 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -56,11 +56,14 @@ size_t getStringObjectSdsUsedMemory(robj *o) {
/* Client.reply list dup and free methods. */
void *dupClientReplyValue(void *o) {
- return sdsdup(o);
+ clientReplyBlock *old = o;
+ clientReplyBlock *buf = zmalloc(sizeof(clientReplyBlock) + old->size);
+ memcpy(buf, o, sizeof(clientReplyBlock) + old->size);
+ return buf;
}
void freeClientReplyValue(void *o) {
- sdsfree(o);
+ zfree(o);
}
int listMatchObjects(void *a, void *b) {
@@ -240,25 +243,31 @@ int _addReplyToBuffer(client *c, const char *s, size_t len) {
void _addReplyStringToList(client *c, const char *s, size_t len) {
if (c->flags & CLIENT_CLOSE_AFTER_REPLY) return;
- if (listLength(c->reply) == 0) {
- sds node = sdsnewlen(s,len);
- listAddNodeTail(c->reply,node);
- c->reply_bytes += len;
- } else {
- listNode *ln = listLast(c->reply);
- sds tail = listNodeValue(ln);
-
- /* Append to this object when possible. If tail == NULL it was
- * set via addDeferredMultiBulkLength(). */
- if (tail && (sdsavail(tail) >= len || sdslen(tail)+len <= PROTO_REPLY_CHUNK_BYTES)) {
- tail = sdscatlen(tail,s,len);
- listNodeValue(ln) = tail;
- c->reply_bytes += len;
- } else {
- sds node = sdsnewlen(s,len);
- listAddNodeTail(c->reply,node);
- c->reply_bytes += len;
- }
+ listNode *ln = listLast(c->reply);
+ clientReplyBlock *tail = ln? listNodeValue(ln): NULL;
+ /* It is possible that we have a tail list node, but no tail buffer.
+ * if addDeferredMultiBulkLength() was used. */
+
+ /* Append to tail string when possible. */
+ if (tail) {
+ /* Copy the part we can fit into the tail, and leave the rest for a new node */
+ size_t avail = tail->size - tail->used;
+ size_t copy = avail >= len? len: avail;
+ memcpy(tail->buf + tail->used, s, copy);
+ tail->used += copy;
+ s += copy;
+ len -= copy;
+ }
+ if (len) {
+ /* Create a new node, make sure it is allocated to at least PROTO_REPLY_CHUNK_BYTES */
+ size_t size = len < PROTO_REPLY_CHUNK_BYTES? PROTO_REPLY_CHUNK_BYTES: len;
+ tail = zmalloc(size + sizeof(clientReplyBlock));
+ /* take over the allocation's internal fragmentation */
+ tail->size = zmalloc_usable(tail) - sizeof(clientReplyBlock);
+ tail->used = len;
+ memcpy(tail->buf, s, len);
+ listAddNodeTail(c->reply, tail);
+ c->reply_bytes += tail->size;
}
asyncCloseClientOnOutputBufferLimitReached(c);
}
@@ -390,26 +399,35 @@ void *addDeferredMultiBulkLength(client *c) {
/* Populate the length object and try gluing it to the next chunk. */
void setDeferredMultiBulkLength(client *c, void *node, long length) {
listNode *ln = (listNode*)node;
- sds len, next;
+ clientReplyBlock *next;
+ char lenstr[128];
+ size_t lenstr_len = sprintf(lenstr, "*%ld\r\n", length);
/* Abort when *node is NULL: when the client should not accept writes
* we return NULL in addDeferredMultiBulkLength() */
if (node == NULL) return;
-
- len = sdscatprintf(sdsnewlen("*",1),"%ld\r\n",length);
- listNodeValue(ln) = len;
- c->reply_bytes += sdslen(len);
- if (ln->next != NULL) {
- next = listNodeValue(ln->next);
-
- /* Only glue when the next node is non-NULL (an sds in this case) */
- if (next != NULL) {
- len = sdscatsds(len,next);
- listDelNode(c->reply,ln->next);
- listNodeValue(ln) = len;
- /* No need to update c->reply_bytes: we are just moving the same
- * amount of bytes from one node to another. */
- }
+ serverAssert(!listNodeValue(ln));
+
+ /* Glue into next node when:
+ * - the next node is non-NULL,
+ * - it has enough room already allocated
+ * - and not too large (avoid large memmove) */
+ if (ln->next != NULL && (next = listNodeValue(ln->next)) &&
+ next->size - next->used >= lenstr_len &&
+ next->used < PROTO_REPLY_CHUNK_BYTES * 4) {
+ memmove(next->buf + lenstr_len, next->buf, next->used);
+ memcpy(next->buf, lenstr, lenstr_len);
+ next->used += lenstr_len;
+ listDelNode(c->reply,ln);
+ } else {
+ /* Create a new node */
+ clientReplyBlock *buf = zmalloc(lenstr_len + sizeof(clientReplyBlock));
+ /* take over the allocation's internal fragmentation */
+ buf->size = zmalloc_usable(buf) - sizeof(clientReplyBlock);
+ buf->used = lenstr_len;
+ memcpy(buf->buf, lenstr, lenstr_len);
+ listNodeValue(ln) = buf;
+ c->reply_bytes += buf->size;
}
asyncCloseClientOnOutputBufferLimitReached(c);
}
@@ -895,7 +913,7 @@ client *lookupClientByID(uint64_t id) {
int writeToClient(int fd, client *c, int handler_installed) {
ssize_t nwritten = 0, totwritten = 0;
size_t objlen;
- sds o;
+ clientReplyBlock *o;
while(clientHasPendingReplies(c)) {
if (c->bufpos > 0) {
@@ -912,23 +930,24 @@ int writeToClient(int fd, client *c, int handler_installed) {
}
} else {
o = listNodeValue(listFirst(c->reply));
- objlen = sdslen(o);
+ objlen = o->used;
if (objlen == 0) {
+ c->reply_bytes -= o->size;
listDelNode(c->reply,listFirst(c->reply));
continue;
}
- nwritten = write(fd, o + c->sentlen, objlen - c->sentlen);
+ nwritten = write(fd, o->buf + c->sentlen, objlen - c->sentlen);
if (nwritten <= 0) break;
c->sentlen += nwritten;
totwritten += nwritten;
/* If we fully sent the object on head go to the next one */
if (c->sentlen == objlen) {
+ c->reply_bytes -= o->size;
listDelNode(c->reply,listFirst(c->reply));
c->sentlen = 0;
- c->reply_bytes -= objlen;
/* If there are no longer objects in the list, we expect
* the count of reply bytes to be exactly zero. */
if (listLength(c->reply) == 0)
@@ -1899,10 +1918,7 @@ void rewriteClientCommandArgument(client *c, int i, robj *newval) {
* the caller wishes. The main usage of this function currently is
* enforcing the client output length limits. */
unsigned long getClientOutputBufferMemoryUsage(client *c) {
- unsigned long list_item_size = sizeof(listNode)+5;
- /* The +5 above means we assume an sds16 hdr, may not be true
- * but is not going to be a problem. */
-
+ unsigned long list_item_size = sizeof(listNode) + sizeof(clientReplyBlock);
return c->reply_bytes + (list_item_size*listLength(c->reply));
}
diff --git a/src/replication.c b/src/replication.c
index 0adef8aec..bc3755398 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2148,6 +2148,7 @@ void replicationCacheMaster(client *c) {
server.master->read_reploff = server.master->reploff;
if (c->flags & CLIENT_MULTI) discardTransaction(c);
listEmpty(c->reply);
+ c->reply_bytes = 0;
c->bufpos = 0;
resetClient(c);
diff --git a/src/scripting.c b/src/scripting.c
index 857859e70..328e3d681 100644
--- a/src/scripting.c
+++ b/src/scripting.c
@@ -575,9 +575,9 @@ int luaRedisGenericCommand(lua_State *lua, int raise_error) {
reply = sdsnewlen(c->buf,c->bufpos);
c->bufpos = 0;
while(listLength(c->reply)) {
- sds o = listNodeValue(listFirst(c->reply));
+ clientReplyBlock *o = listNodeValue(listFirst(c->reply));
- reply = sdscatsds(reply,o);
+ reply = sdscatlen(reply,o->buf,o->used);
listDelNode(c->reply,listFirst(c->reply));
}
}
diff --git a/src/server.c b/src/server.c
index 502589cf0..971595872 100644
--- a/src/server.c
+++ b/src/server.c
@@ -3101,6 +3101,11 @@ sds genRedisInfoString(char *section) {
"rss_overhead_bytes:%zu\r\n"
"mem_fragmentation_ratio:%.2f\r\n"
"mem_fragmentation_bytes:%zu\r\n"
+ "mem_not_counted_for_evict:%zu\r\n"
+ "mem_replication_backlog:%zu\r\n"
+ "mem_clients_slaves:%zu\r\n"
+ "mem_clients_normal:%zu\r\n"
+ "mem_aof_buffer:%zu\r\n"
"mem_allocator:%s\r\n"
"active_defrag_running:%d\r\n"
"lazyfree_pending_objects:%zu\r\n",
@@ -3133,6 +3138,11 @@ sds genRedisInfoString(char *section) {
mh->rss_extra_bytes,
mh->total_frag, /* this is the total RSS overhead, including fragmentation, */
mh->total_frag_bytes, /* named so for backwards compatibility */
+ freeMemoryGetNotCountedMemory(),
+ mh->repl_backlog,
+ mh->clients_slaves,
+ mh->clients_normal,
+ mh->aof_buffer,
ZMALLOC_LIB,
server.active_defrag_running,
lazyfreeGetPendingObjectsCount()
diff --git a/src/server.h b/src/server.h
index f3df6b46b..c35333bd3 100644
--- a/src/server.h
+++ b/src/server.h
@@ -619,6 +619,11 @@ typedef struct redisObject {
struct evictionPoolEntry; /* Defined in evict.c */
+typedef struct clientReplyBlock {
+ size_t size, used;
+ char buf[];
+} clientReplyBlock;
+
/* Redis database representation. There are multiple databases identified
* by integers from 0 (the default database) up to the max configured
* database. The database number is the 'id' field in the structure. */
@@ -1423,6 +1428,7 @@ void addReplySubcommandSyntaxError(client *c);
void copyClientOutputBuffer(client *dst, client *src);
size_t sdsZmallocSize(sds s);
size_t getStringObjectSdsUsedMemory(robj *o);
+void freeClientReplyValue(void *o);
void *dupClientReplyValue(void *o);
void getClientsMaxBuffers(unsigned long *longest_output_list,
unsigned long *biggest_input_buffer);
@@ -1664,6 +1670,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec);
/* Core functions */
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level);
+size_t freeMemoryGetNotCountedMemory();
int freeMemoryIfNeeded(void);
int processCommand(client *c);
void setupSignalHandlers(void);
diff --git a/src/zmalloc.c b/src/zmalloc.c
index 4956f905f..0d9917510 100644
--- a/src/zmalloc.c
+++ b/src/zmalloc.c
@@ -182,6 +182,9 @@ size_t zmalloc_size(void *ptr) {
if (size&(sizeof(long)-1)) size += sizeof(long)-(size&(sizeof(long)-1));
return size+PREFIX_SIZE;
}
+size_t zmalloc_usable(void *ptr) {
+ return zmalloc_usable(ptr)-PREFIX_SIZE;
+}
#endif
void zfree(void *ptr) {
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 49b33b883..9c9229907 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -98,6 +98,9 @@ void *zmalloc_no_tcache(size_t size);
#ifndef HAVE_MALLOC_SIZE
size_t zmalloc_size(void *ptr);
+size_t zmalloc_usable(void *ptr);
+#else
+#define zmalloc_usable(p) zmalloc_size(p)
#endif
#endif /* __ZMALLOC_H */