path: root/items.c
author     dormando <dormando@rydia.net>    2017-02-22 19:32:50 -0800
committer  dormando <dormando@rydia.net>    2017-03-19 18:32:33 -0700
commit     ae84d771811394a9f6f2ff12022707f69bdc311f (patch)
tree       c84a692064c2604be72c097072afd4f6d53f4762 /items.c
parent     2f9f51d48af688b1283a235de9d918963666e166 (diff)
download   memcached-ae84d771811394a9f6f2ff12022707f69bdc311f.tar.gz
refactor chunk chaining for memory efficiency (tag: 1.4.36)
Memory chunk chains would simply stitch multiple chunks of the highest slab class together. If your item was 17k and the chunk limit is 16k, the item would use 32k of space instead of a bit over 17k.

This refactor simplifies the slab allocation path and pulls the allocation of chunks into the upload process. A "large" item gets a small chunk assigned as an object header, rather than attempting to inline a slab chunk into a parent chunk. It then gets chunks individually allocated and added into the chain while the object uploads.

This solves a lot of issues:

1) When assembling new, potentially very large items, we don't have to sit and spin evicting objects all at once. If there are 20 16k chunks in the tail and we allocate a 1 meg item, the new item will evict one of those chunks in between each read, rather than trying to guess how many loops to run before giving up. Very large objects take time to read from the socket anyway.

2) Simplifies code around the initial chunk. Originally, embedding data into the top chunk while also chaining further chunks required a good amount of fiddling. (Though this might flip back to embedding the initial chunk if I can clean it up a bit more.)

3) Pulling chunks individually means the slabber code can be flattened to not think about chunks aside from freeing them, which culled a lot of code and removed branches from a hot path.

4) The size of the final chunk is naturally set to the remaining amount of bytes that need to be stored, which means chunks from another slab class can be pulled to "cap off" a large item, reducing memory overhead.
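To make the new upload flow concrete, here is a minimal sketch (not code from this patch) of how a caller could drive do_item_alloc_chunk() while reading a large value from a socket. read_value_into_chunks() is an illustrative name, the real logic lives in the connection read path, and locking, refcounting, and protocol framing are omitted.

#include <unistd.h>
#include "memcached.h"   /* item, item_chunk, ITEM_data, do_item_alloc_chunk */

/* Hypothetical helper: read nbytes of value data from a client socket into
 * a chunked item, growing the chunk chain on demand. */
static int read_value_into_chunks(item *it, int fd, int nbytes) {
    item_chunk *ch = (item_chunk *) ITEM_data(it); /* header's embedded chunk */
    int remain = nbytes;

    while (remain > 0) {
        if (ch->size == ch->used) {
            /* Current chunk is full (the header chunk starts at size 0), so
             * pull the next one. The final chunk is sized to the bytes left
             * and may come from a smaller slab class. */
            item_chunk *nch = do_item_alloc_chunk(ch, remain);
            if (nch == NULL)
                return -1;          /* out of memory mid-upload */
            ch = nch;
        }
        int tocopy = ch->size - ch->used;
        if (tocopy > remain)
            tocopy = remain;
        ssize_t nread = read(fd, ch->data + ch->used, tocopy);
        if (nread <= 0)
            return -1;              /* client error or disconnect */
        ch->used += (int) nread;
        remain -= (int) nread;
    }
    return 0;
}

This mirrors point 1) above: between reads the allocator only has to free roughly one chunk's worth of memory at a time, instead of evicting the whole item's footprint up front.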
Diffstat (limited to 'items.c')
-rw-r--r--  items.c  123
1 file changed, 85 insertions, 38 deletions
diff --git a/items.c b/items.c
index 78be101..8bd7d18 100644
--- a/items.c
+++ b/items.c
@@ -169,39 +169,15 @@ static size_t item_make_header(const uint8_t nkey, const unsigned int flags, con
return sizeof(item) + nkey + *nsuffix + nbytes;
}
-item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
- const rel_time_t exptime, const int nbytes) {
- int i;
- uint8_t nsuffix;
+static item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
item *it = NULL;
- char suffix[40];
- // Avoid potential underflows.
- if (nbytes < 2)
- return 0;
-
- size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
- if (settings.use_cas) {
- ntotal += sizeof(uint64_t);
- }
-
- unsigned int id = slabs_clsid(ntotal);
- if (id == 0)
- return 0;
-
+ int i;
/* If no memory is available, attempt a direct LRU juggle/eviction */
/* This is a race in order to simplify lru_pull_tail; in cases where
* locked items are on the tail, you want them to fall out and cause
* occasional OOM's, rather than internally work around them.
* This also gives one fewer code path for slab alloc/free
*/
- /* TODO: if power_largest, try a lot more times? or a number of times
- * based on how many chunks the new object should take up?
- * or based on the size of an object lru_pull_tail() says it evicted?
- * This is a classical GC problem if "large items" are of too varying of
- * sizes. This is actually okay here since the larger the data, the more
- * bandwidth it takes, the more time we can loop in comparison to serving
- * and replacing small items.
- */
for (i = 0; i < 10; i++) {
uint64_t total_bytes;
/* Try to reclaim memory first */
@@ -214,13 +190,12 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
total_bytes -= temp_lru_size(id);
if (it == NULL) {
- if (settings.lru_segmented) {
- if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT, 0) <= 0) {
+ if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT, 0) <= 0) {
+ if (settings.lru_segmented) {
lru_pull_tail(id, HOT_LRU, total_bytes, 0, 0);
- }
- } else {
- if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0) <= 0)
+ } else {
break;
+ }
}
} else {
break;
@@ -233,6 +208,80 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
pthread_mutex_unlock(&lru_locks[id]);
}
+ return it;
+}
+
+/* Chain another chunk onto this chunk. */
+/* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
+ * flag, it counts as busy and skips.
+ * I think it might still not be safe to do linking outside of the slab lock
+ */
+item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
+ // TODO: Should be a cleaner way of finding real size with slabber calls
+ size_t size = bytes_remain + sizeof(item_chunk);
+ if (size > settings.slab_chunk_size_max)
+ size = settings.slab_chunk_size_max;
+ unsigned int id = slabs_clsid(size);
+
+ item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
+ if (nch == NULL)
+ return NULL;
+
+ // link in.
+ // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
+ slabs_mlock();
+ nch->head = ch->head;
+ ch->next = nch;
+ nch->prev = ch;
+ nch->next = 0;
+ nch->used = 0;
+ nch->slabs_clsid = id;
+ nch->size = size - sizeof(item_chunk);
+ nch->it_flags |= ITEM_CHUNK;
+ slabs_munlock();
+ return nch;
+}
+
+item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
+ const rel_time_t exptime, const int nbytes) {
+ uint8_t nsuffix;
+ item *it = NULL;
+ char suffix[40];
+ // Avoid potential underflows.
+ if (nbytes < 2)
+ return 0;
+
+ size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
+ if (settings.use_cas) {
+ ntotal += sizeof(uint64_t);
+ }
+
+ unsigned int id = slabs_clsid(ntotal);
+ unsigned int hdr_id = 0;
+ if (id == 0)
+ return 0;
+
+ /* This is a large item. Allocate a header object now, lazily allocate
+ * chunks while reading the upload.
+ */
+ if (ntotal > settings.slab_chunk_size_max) {
+ /* We still link this item into the LRU for the larger slab class, but
+ * we're pulling a header from an entirely different slab class. The
+ * free routines handle large items specifically.
+ */
+ int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
+ if (settings.use_cas) {
+ htotal += sizeof(uint64_t);
+ }
+ hdr_id = slabs_clsid(htotal);
+ it = do_item_alloc_pull(htotal, hdr_id);
+ /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
+ if (it != NULL)
+ it->it_flags |= ITEM_CHUNKED;
+ } else {
+ it = do_item_alloc_pull(ntotal, id);
+ }
+
if (it == NULL) {
pthread_mutex_lock(&lru_locks[id]);
itemstats[id].outofmemory++;
@@ -273,18 +322,16 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
}
it->nsuffix = nsuffix;
- /* Need to shuffle the pointer stored in h_next into it->data. */
+ /* Initialize internal chunk. */
if (it->it_flags & ITEM_CHUNKED) {
item_chunk *chunk = (item_chunk *) ITEM_data(it);
- chunk->next = (item_chunk *) it->h_next;
+ chunk->next = 0;
chunk->prev = 0;
- chunk->head = it;
- /* Need to chain back into the head's chunk */
- chunk->next->prev = chunk;
- chunk->size = chunk->next->size - ((char *)chunk - (char *)it);
chunk->used = 0;
- assert(chunk->size > 0);
+ chunk->size = 0;
+ chunk->head = it;
+ chunk->orig_clsid = hdr_id;
}
it->h_next = 0;