author    | dormando <dormando@rydia.net> | 2017-02-22 19:32:50 -0800
committer | dormando <dormando@rydia.net> | 2017-03-19 18:32:33 -0700
commit    | ae84d771811394a9f6f2ff12022707f69bdc311f (patch)
tree      | c84a692064c2604be72c097072afd4f6d53f4762
parent    | 2f9f51d48af688b1283a235de9d918963666e166 (diff)
download  | memcached-ae84d771811394a9f6f2ff12022707f69bdc311f.tar.gz
refactor chunk chaining for memory efficiency (1.4.36)
Memory chunk chains used to simply stitch together multiple chunks from the
highest slab class. If your item was 17k and the chunk limit was 16k, the item
would use 32k of space instead of a bit over 17k.
This refactor simplifies the slab allocation path and pulls chunk allocation
into the upload process. A "large" item is first assigned a small chunk to act
as an object header, rather than attempting to inline a slab chunk into a
parent chunk. Chunks are then allocated individually and added to the chain
while the object uploads.
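To make the new shape concrete, here is a small self-contained sketch of the
chaining idea. This is not memcached's code: struct chunk, alloc_next_chunk()
and CHUNK_SIZE_MAX are illustrative stand-ins, and plain malloc() takes the
place of pulling memory from a slab class. The real version is
do_item_alloc_chunk() in the diff below, which also picks a slab class via
slabs_clsid() and links the new chunk under the slabs lock.

    /* Toy model: each chunk is sized to the bytes still left to store, capped
     * at a maximum chunk size, and linked onto the tail of the chain. */
    #include <stdio.h>
    #include <stdlib.h>

    #define CHUNK_SIZE_MAX (16 * 1024)  /* stand-in for settings.slab_chunk_size_max */

    struct chunk {
        struct chunk *next;
        size_t size;                    /* usable bytes in data[] */
        size_t used;
        char data[];
    };

    /* Allocate the next chunk, sized to what remains of the value. */
    static struct chunk *alloc_next_chunk(struct chunk *tail, size_t bytes_remain) {
        size_t total = bytes_remain + sizeof(struct chunk);
        if (total > CHUNK_SIZE_MAX)
            total = CHUNK_SIZE_MAX;
        struct chunk *nch = malloc(total);
        if (nch == NULL)
            return NULL;
        nch->next = NULL;
        nch->size = total - sizeof(struct chunk);
        nch->used = 0;
        if (tail != NULL)
            tail->next = nch;           /* chain onto the previous chunk */
        return nch;
    }

    int main(void) {
        size_t value_len = 17 * 1024;   /* the 17k example above */
        size_t remain = value_len, nchunks = 0, allocated = 0;
        struct chunk *head = NULL, *tail = NULL;

        while (remain > 0) {
            struct chunk *ch = alloc_next_chunk(tail, remain);
            if (ch == NULL)
                return 1;               /* a real server would evict and retry */
            if (head == NULL)
                head = ch;
            tail = ch;
            size_t todo = remain < ch->size ? remain : ch->size;
            ch->used = todo;            /* a real server memcpy()s from the socket here */
            remain -= todo;
            nchunks++;
            allocated += ch->size + sizeof(struct chunk);
        }
        printf("stored %zu bytes in %zu chunks, %zu bytes allocated\n",
               value_len, nchunks, allocated);

        while (head != NULL) {          /* tear down the chain */
            struct chunk *next = head->next;
            free(head);
            head = next;
        }
        return 0;
    }

The property to notice is that the last chunk in the chain naturally comes out
sized to whatever is left of the value, which is what point 4 below relies on.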
This approach solves a number of issues:
1) When assembling new, potentially very large items, we don't have to sit and
spin evicting objects all at once. If there are 20 16k chunks in the tail and
we allocate a 1 meg item, the new item will evict one of those chunks
in between each read, rather than trying to guess how many loops to run before
giving up. Very large objects take time to read from the socket anyway.
2) Simplifies code around the initial chunk. Originally, embedding data into
the top chunk while wiring up the rest of the chain at the same time required
a good amount of fiddling. (Though this might flip back to embedding the
initial chunk if I can clean it up a bit more.)
3) Pulling chunks individually means the slabber code can be flattened to not
think about chunks aside from freeing them, which culled a lot of code and
removed branches from a hot path.
4) The size of the final chunk is naturally set to the remaining number of
bytes that need to be stored, which means chunks from another slab class can
be pulled in to "cap off" a large item, reducing memory overhead (see the
worked example after this list).
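As a rough worked example of point 4 (exact figures depend on the configured
slab class sizes): with a 16k chunk limit, a 17k value previously consumed two
full 16k chunks from the largest class, i.e. 32k. With this change it consumes
one 16k chunk, a cap-off chunk of roughly 1k pulled from a smaller slab class,
and a small header object, for a bit over 17k in total.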
-rw-r--r-- | items.c                    | 123
-rw-r--r-- | items.h                    |   1
-rw-r--r-- | memcached.c                | 115
-rw-r--r-- | memcached.h                |   2
-rw-r--r-- | slabs.c                    | 180
-rw-r--r-- | slabs.h                    |   3
-rw-r--r-- | t/slabs-reassign-chunked.t |   5
7 files changed, 244 insertions, 185 deletions
diff --git a/items.c b/items.c
@@ -169,39 +169,15 @@ static size_t item_make_header(const uint8_t nkey, const unsigned int flags, con
     return sizeof(item) + nkey + *nsuffix + nbytes;
 }
 
-item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
-                    const rel_time_t exptime, const int nbytes) {
-    int i;
-    uint8_t nsuffix;
+static item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
     item *it = NULL;
-    char suffix[40];
-    // Avoid potential underflows.
-    if (nbytes < 2)
-        return 0;
-
-    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
-    if (settings.use_cas) {
-        ntotal += sizeof(uint64_t);
-    }
-
-    unsigned int id = slabs_clsid(ntotal);
-    if (id == 0)
-        return 0;
-
+    int i;
     /* If no memory is available, attempt a direct LRU juggle/eviction */
     /* This is a race in order to simplify lru_pull_tail; in cases where
      * locked items are on the tail, you want them to fall out and cause
      * occasional OOM's, rather than internally work around them.
      * This also gives one fewer code path for slab alloc/free
      */
-    /* TODO: if power_largest, try a lot more times? or a number of times
-     * based on how many chunks the new object should take up?
-     * or based on the size of an object lru_pull_tail() says it evicted?
-     * This is a classical GC problem if "large items" are of too varying of
-     * sizes. This is actually okay here since the larger the data, the more
-     * bandwidth it takes, the more time we can loop in comparison to serving
-     * and replacing small items.
-     */
     for (i = 0; i < 10; i++) {
         uint64_t total_bytes;
         /* Try to reclaim memory first */
@@ -214,13 +190,12 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
         total_bytes -= temp_lru_size(id);
         if (it == NULL) {
-            if (settings.lru_segmented) {
-                if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT, 0) <= 0) {
+            if (lru_pull_tail(id, COLD_LRU, total_bytes, LRU_PULL_EVICT, 0) <= 0) {
+                if (settings.lru_segmented) {
                     lru_pull_tail(id, HOT_LRU, total_bytes, 0, 0);
-                }
-            } else {
-                if (lru_pull_tail(id, COLD_LRU, 0, LRU_PULL_EVICT, 0) <= 0)
+                } else {
                     break;
+                }
             }
         } else {
             break;
         }
@@ -233,6 +208,80 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
         pthread_mutex_unlock(&lru_locks[id]);
     }
 
+    return it;
+}
+
+/* Chain another chunk onto this chunk. */
+/* slab mover: if it finds a chunk without ITEM_CHUNK flag, and no ITEM_LINKED
+ * flag, it counts as busy and skips.
+ * I think it might still not be safe to do linking outside of the slab lock
+ */
+item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain) {
+    // TODO: Should be a cleaner way of finding real size with slabber calls
+    size_t size = bytes_remain + sizeof(item_chunk);
+    if (size > settings.slab_chunk_size_max)
+        size = settings.slab_chunk_size_max;
+    unsigned int id = slabs_clsid(size);
+
+    item_chunk *nch = (item_chunk *) do_item_alloc_pull(size, id);
+    if (nch == NULL)
+        return NULL;
+
+    // link in.
+    // ITEM_CHUNK[ED] bits need to be protected by the slabs lock.
+    slabs_mlock();
+    nch->head = ch->head;
+    ch->next = nch;
+    nch->prev = ch;
+    nch->next = 0;
+    nch->used = 0;
+    nch->slabs_clsid = id;
+    nch->size = size - sizeof(item_chunk);
+    nch->it_flags |= ITEM_CHUNK;
+    slabs_munlock();
+    return nch;
+}
+
+item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
+                    const rel_time_t exptime, const int nbytes) {
+    uint8_t nsuffix;
+    item *it = NULL;
+    char suffix[40];
+    // Avoid potential underflows.
+    if (nbytes < 2)
+        return 0;
+
+    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
+    if (settings.use_cas) {
+        ntotal += sizeof(uint64_t);
+    }
+
+    unsigned int id = slabs_clsid(ntotal);
+    unsigned int hdr_id = 0;
+    if (id == 0)
+        return 0;
+
+    /* This is a large item. Allocate a header object now, lazily allocate
+     * chunks while reading the upload.
+     */
+    if (ntotal > settings.slab_chunk_size_max) {
+        /* We still link this item into the LRU for the larger slab class, but
+         * we're pulling a header from an entirely different slab class. The
+         * free routines handle large items specifically.
+         */
+        int htotal = nkey + 1 + nsuffix + sizeof(item) + sizeof(item_chunk);
+        if (settings.use_cas) {
+            htotal += sizeof(uint64_t);
+        }
+        hdr_id = slabs_clsid(htotal);
+        it = do_item_alloc_pull(htotal, hdr_id);
+        /* setting ITEM_CHUNKED is fine here because we aren't LINKED yet. */
+        if (it != NULL)
+            it->it_flags |= ITEM_CHUNKED;
+    } else {
+        it = do_item_alloc_pull(ntotal, id);
+    }
+
     if (it == NULL) {
         pthread_mutex_lock(&lru_locks[id]);
         itemstats[id].outofmemory++;
@@ -273,18 +322,16 @@ item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
     }
     it->nsuffix = nsuffix;
 
-    /* Need to shuffle the pointer stored in h_next into it->data. */
+    /* Initialize internal chunk. */
     if (it->it_flags & ITEM_CHUNKED) {
         item_chunk *chunk = (item_chunk *) ITEM_data(it);
 
-        chunk->next = (item_chunk *) it->h_next;
+        chunk->next = 0;
         chunk->prev = 0;
-        chunk->head = it;
-        /* Need to chain back into the head's chunk */
-        chunk->next->prev = chunk;
-        chunk->size = chunk->next->size - ((char *)chunk - (char *)it);
         chunk->used = 0;
-        assert(chunk->size > 0);
+        chunk->size = 0;
+        chunk->head = it;
+        chunk->orig_clsid = hdr_id;
     }
     it->h_next = 0;
 
diff --git a/items.h b/items.h
@@ -10,6 +10,7 @@ uint64_t get_cas_id(void);
 
 /*@null@*/
 item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
                     const rel_time_t exptime, const int nbytes);
+item_chunk *do_item_alloc_chunk(item_chunk *ch, const size_t bytes_remain);
 void item_free(item *it);
 bool item_size_ok(const size_t nkey, const int flags, const int nbytes);
diff --git a/memcached.c b/memcached.c
index 016517a..63de2fb 100644
--- a/memcached.c
+++ b/memcached.c
@@ -1398,17 +1398,10 @@ static void complete_update_bin(conn *c) {
         item_chunk *ch = (item_chunk *) c->ritem;
         if (ch->size == ch->used)
             ch = ch->next;
-        if (ch->size - ch->used > 1) {
-            ch->data[ch->used + 1] = '\r';
-            ch->data[ch->used + 2] = '\n';
-            ch->used += 2;
-        } else {
-            ch->data[ch->used + 1] = '\r';
-            ch->next->data[0] = '\n';
-            ch->used++;
-            ch->next->used++;
-            assert(ch->size == ch->used);
-        }
+        assert(ch->size - ch->used >= 2);
+        ch->data[ch->used + 1] = '\r';
+        ch->data[ch->used + 2] = '\n';
+        ch->used += 2;
     }
 
     ret = store_item(it, c->cmd, c);
@@ -2534,11 +2527,15 @@ static void complete_nread(conn *c) {
 
 /* Destination must always be chunked */
 /* This should be part of item.c */
-static void _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
+static int _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
     item_chunk *dch = (item_chunk *) ITEM_data(d_it);
     /* Advance dch until we find free space */
     while (dch->size == dch->used) {
-        dch = dch->next;
+        if (dch->next) {
+            dch = dch->next;
+        } else {
+            break;
+        }
     }
 
     if (s_it->it_flags & ITEM_CHUNKED) {
@@ -2560,7 +2557,12 @@ static void _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
             remain -= todo;
             assert(dch->used <= dch->size);
             if (dch->size == dch->used) {
-                dch = dch->next;
+                item_chunk *tch = do_item_alloc_chunk(dch, remain);
+                if (tch) {
+                    dch = tch;
+                } else {
+                    return -1;
+                }
             }
             assert(copied <= sch->used);
             if (copied == sch->used) {
@@ -2576,23 +2578,32 @@ static void _store_item_copy_chunks(item *d_it, item *s_it, const int len) {
         while (len > done && dch) {
             int todo = (dch->size - dch->used < len - done)
                 ? dch->size - dch->used : len - done;
-            assert(dch->size - dch->used != 0);
+            //assert(dch->size - dch->used != 0);
             memcpy(dch->data + dch->used, ITEM_data(s_it) + done, todo);
             done += todo;
             dch->used += todo;
             assert(dch->used <= dch->size);
-            if (dch->size == dch->used)
-                dch = dch->next;
+            if (dch->size == dch->used) {
+                item_chunk *tch = do_item_alloc_chunk(dch, len - done);
+                if (tch) {
+                    dch = tch;
+                } else {
+                    return -1;
+                }
+            }
         }
         assert(len == done);
     }
+    return 0;
 }
 
-static void _store_item_copy_data(int comm, item *old_it, item *new_it, item *add_it) {
+static int _store_item_copy_data(int comm, item *old_it, item *new_it, item *add_it) {
     if (comm == NREAD_APPEND) {
         if (new_it->it_flags & ITEM_CHUNKED) {
-            _store_item_copy_chunks(new_it, old_it, old_it->nbytes - 2);
-            _store_item_copy_chunks(new_it, add_it, add_it->nbytes);
+            if (_store_item_copy_chunks(new_it, old_it, old_it->nbytes - 2) == -1 ||
+                _store_item_copy_chunks(new_it, add_it, add_it->nbytes) == -1) {
+                return -1;
+            }
         } else {
             memcpy(ITEM_data(new_it), ITEM_data(old_it), old_it->nbytes);
             memcpy(ITEM_data(new_it) + old_it->nbytes - 2 /* CRLF */, ITEM_data(add_it), add_it->nbytes);
@@ -2600,13 +2611,16 @@ static void _store_item_copy_data(int comm, item *old_it, item *new_it, item *ad
     } else {
         /* NREAD_PREPEND */
         if (new_it->it_flags & ITEM_CHUNKED) {
-            _store_item_copy_chunks(new_it, add_it, add_it->nbytes - 2);
-            _store_item_copy_chunks(new_it, old_it, old_it->nbytes);
+            if (_store_item_copy_chunks(new_it, add_it, add_it->nbytes - 2) == -1 ||
+                _store_item_copy_chunks(new_it, old_it, old_it->nbytes) == -1) {
+                return -1;
+            }
         } else {
             memcpy(ITEM_data(new_it), ITEM_data(add_it), add_it->nbytes);
             memcpy(ITEM_data(new_it) + add_it->nbytes - 2 /* CRLF */, ITEM_data(old_it), old_it->nbytes);
         }
     }
+    return 0;
 }
 
 /*
@@ -2690,13 +2704,14 @@ enum store_item_type do_store_item(item *it, int comm, conn *c, const uint32_t h
             new_it = do_item_alloc(key, it->nkey, flags, old_it->exptime,
                                    it->nbytes + old_it->nbytes - 2 /* CRLF */);
 
-            if (new_it == NULL) {
+            /* copy data from it and old_it to new_it */
+            if (new_it == NULL || _store_item_copy_data(comm, old_it, new_it, it) == -1) {
                 failed_alloc = 1;
                 stored = NOT_STORED;
+                // failed data copy, free up.
+                if (new_it != NULL)
+                    item_remove(new_it);
             } else {
-                /* copy data from it and old_it to new_it */
-                _store_item_copy_data(comm, old_it, new_it, it);
-
                 it = new_it;
             }
         }
@@ -4608,6 +4623,26 @@ static int read_into_chunked_item(conn *c) {
 
     while (c->rlbytes > 0) {
         item_chunk *ch = (item_chunk *)c->ritem;
+        assert(ch->used <= ch->size);
+        if (ch->size == ch->used) {
+            // FIXME: ch->next is currently always 0. remove this?
+            if (ch->next) {
+                c->ritem = (char *) ch->next;
+            } else {
+                /* Allocate next chunk. Binary protocol needs 2b for \r\n */
+                c->ritem = (char *) do_item_alloc_chunk(ch, c->rlbytes +
+                        ((c->protocol == binary_prot) ? 2 : 0));
+                if (!c->ritem) {
+                    // We failed an allocation. Let caller handle cleanup.
+                    total = -2;
+                    break;
+                }
+                // ritem has new chunk, restart the loop.
+                continue;
+                //assert(c->rlbytes == 0);
+            }
+        }
+
         int unused = ch->size - ch->used;
         /* first check if we have leftovers in the conn_read buffer */
         if (c->rbytes > 0) {
@@ -4642,15 +4677,19 @@ static int read_into_chunked_item(conn *c) {
                 break;
             }
         }
+    }
 
-        assert(ch->used <= ch->size);
-        if (ch->size == ch->used) {
-            if (ch->next) {
-                c->ritem = (char *) ch->next;
-            } else {
-                /* No space left. */
-                assert(c->rlbytes == 0);
-                break;
+    /* At some point I will be able to ditch the \r\n from item storage and
+       remove all of these kludges.
+       The above binprot check ensures inline space for \r\n, but if we do
+       exactly enough allocs there will be no additional chunk for \r\n.
+     */
+    if (c->rlbytes == 0 && c->protocol == binary_prot && total >= 0) {
+        item_chunk *ch = (item_chunk *)c->ritem;
+        if (ch->size - ch->used < 2) {
+            c->ritem = (char *) do_item_alloc_chunk(ch, 2);
+            if (!c->ritem) {
+                total = -2;
             }
         }
     }
@@ -4864,6 +4903,14 @@ static void drive_machine(conn *c) {
                 stop = true;
                 break;
            }
+
+            /* Memory allocation failure */
+            if (res == -2) {
+                out_of_memory(c, "SERVER_ERROR Out of memory during read");
+                c->sbytes = c->rlbytes;
+                c->write_and_go = conn_swallow;
+                break;
+            }
             /* otherwise we have a real error, on which we close the connection */
             if (settings.verbose > 0) {
                 fprintf(stderr, "Failed to read, and not due to blocking:\n"
diff --git a/memcached.h b/memcached.h
index 9b65ce6..1a760e6 100644
--- a/memcached.h
+++ b/memcached.h
@@ -459,7 +459,7 @@ typedef struct _strchunk {
     int used;                   /* chunk space used */
     int nbytes;                 /* used. */
    unsigned short refcount;    /* used? */
-    uint8_t nsuffix;            /* unused */
+    uint8_t orig_clsid;         /* For obj hdr chunks slabs_clsid is fake. */
     uint8_t it_flags;           /* ITEM_* above. */
     uint8_t slabs_clsid;        /* Same as above. */
     char data[];
diff --git a/slabs.c b/slabs.c
@@ -246,64 +246,6 @@ static int do_slabs_newslab(const unsigned int id) {
     return 1;
 }
 
-/* This calculation ends up adding sizeof(void *) to the item size. */
-static void *do_slabs_alloc_chunked(const size_t size, slabclass_t *p, unsigned int id) {
-    void *ret = NULL;
-    item *it = NULL;
-    int x;
-    int csize = p->size - sizeof(item_chunk);
-    unsigned int chunks_req = size / csize;
-    if (size % csize != 0)
-        chunks_req++;
-    while (p->sl_curr < chunks_req) {
-        if (do_slabs_newslab(id) == 0)
-            break;
-    }
-
-    if (p->sl_curr >= chunks_req) {
-        item_chunk *chunk = NULL;
-
-        /* Configure the head item in the chain. */
-        it = (item *)p->slots;
-        p->slots = it->next;
-        if (it->next) it->next->prev = 0;
-
-        /* Squirrel away the "top chunk" into h_next for now */
-        it->h_next = (item *)p->slots;
-        assert(it->h_next != 0);
-        chunk = (item_chunk *) it->h_next;
-
-        /* roll down the chunks, marking them as such. */
-        for (x = 0; x < chunks_req-1; x++) {
-            chunk->it_flags &= ~ITEM_SLABBED;
-            chunk->it_flags |= ITEM_CHUNK;
-            /* Chunks always have a direct reference to the head item */
-            chunk->head = it;
-            chunk->size = p->size - sizeof(item_chunk);
-            chunk->used = 0;
-            chunk = chunk->next;
-        }
-
-        /* The final "next" is now the top of the slab freelist */
-        p->slots = chunk;
-        if (chunk && chunk->prev) {
-            /* Disconnect the final chunk from the chain */
-            chunk->prev->next = 0;
-            chunk->prev = 0;
-        }
-
-        it->it_flags &= ~ITEM_SLABBED;
-        it->it_flags |= ITEM_CHUNKED;
-        it->refcount = 1;
-        p->sl_curr -= chunks_req;
-        ret = (void *)it;
-    } else {
-        ret = NULL;
-    }
-
-    return ret;
-}
-
 /*@null@*/
 static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
         unsigned int flags) {
@@ -321,30 +263,26 @@ static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_
         *total_bytes = p->requested;
     }
 
-    if (size <= p->size) {
-        /* fail unless we have space at the end of a recently allocated page,
-           we have something on our freelist, or we could allocate a new page */
-        if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
-            do_slabs_newslab(id);
-        }
+    assert(size <= p->size);
+    /* fail unless we have space at the end of a recently allocated page,
+       we have something on our freelist, or we could allocate a new page */
+    if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
+        do_slabs_newslab(id);
+    }
 
-        if (p->sl_curr != 0) {
-            /* return off our freelist */
-            it = (item *)p->slots;
-            p->slots = it->next;
-            if (it->next) it->next->prev = 0;
-            /* Kill flag and initialize refcount here for lock safety in slab
-             * mover's freeness detection. */
-            it->it_flags &= ~ITEM_SLABBED;
-            it->refcount = 1;
-            p->sl_curr--;
-            ret = (void *)it;
-        } else {
-            ret = NULL;
-        }
+    if (p->sl_curr != 0) {
+        /* return off our freelist */
+        it = (item *)p->slots;
+        p->slots = it->next;
+        if (it->next) it->next->prev = 0;
+        /* Kill flag and initialize refcount here for lock safety in slab
+         * mover's freeness detection. */
+        it->it_flags &= ~ITEM_SLABBED;
+        it->refcount = 1;
+        p->sl_curr--;
+        ret = (void *)it;
     } else {
-        /* Dealing with a chunked item. */
-        ret = do_slabs_alloc_chunked(size, p, id);
+        ret = NULL;
    }
 
     if (ret) {
@@ -357,47 +295,53 @@ static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_
     return ret;
 }
 
-static void do_slabs_free_chunked(item *it, const size_t size, unsigned int id,
-                                  slabclass_t *p) {
+static void do_slabs_free_chunked(item *it, const size_t size) {
     item_chunk *chunk = (item_chunk *) ITEM_data(it);
-    size_t realsize = size;
-    while (chunk) {
-        realsize += sizeof(item_chunk);
-        chunk = chunk->next;
-    }
-    chunk = (item_chunk *) ITEM_data(it);
-    unsigned int chunks_found = 1;
+    slabclass_t *p;
 
     it->it_flags = ITEM_SLABBED;
     it->slabs_clsid = 0;
     it->prev = 0;
-    it->next = (item *) chunk->next;
-    assert(it->next);
-    /* top chunk should already point back to head */
-    assert(it->next && (void*)it->next->prev == (void*)chunk);
-    chunk = chunk->next;
-    chunk->prev = (item_chunk *)it;
+    // header object's original classid is stored in chunk.
+    p = &slabclass[chunk->orig_clsid];
+    if (chunk->next) {
+        chunk = chunk->next;
+        chunk->prev = 0;
+    } else {
+        // header with no attached chunk
+        chunk = NULL;
+    }
+
+    // return the header object.
+    // TODO: This is in three places, here and in do_slabs_free().
+    it->prev = 0;
+    it->next = p->slots;
+    if (it->next) it->next->prev = it;
+    p->slots = it;
+    p->sl_curr++;
+    // TODO: macro
+    p->requested -= it->nkey + 1 + it->nsuffix + sizeof(item) + sizeof(item_chunk);
+    if (settings.use_cas) {
+        p->requested -= sizeof(uint64_t);
+    }
+    item_chunk *next_chunk;
     while (chunk) {
         assert(chunk->it_flags == ITEM_CHUNK);
         chunk->it_flags = ITEM_SLABBED;
+        p = &slabclass[chunk->slabs_clsid];
         chunk->slabs_clsid = 0;
-        chunks_found++;
-        if (chunk->next) {
-            chunk = chunk->next;
-        } else {
-            break;
-        }
-    }
-    /* must have had nothing hanging off of the final chunk */
-    assert(chunk && chunk->next == 0);
-    /* Tail chunk, link the freelist here. */
-    chunk->next = p->slots;
-    if (chunk->next) chunk->next->prev = chunk;
+        next_chunk = chunk->next;
 
-    p->slots = it;
-    p->sl_curr += chunks_found;
-    p->requested -= size;
+        chunk->prev = 0;
+        chunk->next = p->slots;
+        if (chunk->next) chunk->next->prev = chunk;
+        p->slots = chunk;
+        p->sl_curr++;
+        p->requested -= chunk->size + sizeof(item_chunk);
+
+        chunk = next_chunk;
+    }
     return;
 }
@@ -426,7 +370,7 @@ static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
         p->sl_curr++;
         p->requested -= size;
     } else {
-        do_slabs_free_chunked(it, size, id, p);
+        do_slabs_free_chunked(it, size);
     }
     return;
 }
@@ -649,6 +593,20 @@ unsigned int slabs_available_chunks(const unsigned int id, bool *mem_flag,
     return ret;
 }
 
+/* The slabber system could avoid needing to understand much, if anything,
+ * about items if callbacks were strategically used. Due to how the slab mover
+ * works, certain flag bits can only be adjusted while holding the slabs lock.
+ * Using these functions, isolate sections of code needing this and turn them
+ * into callbacks when an interface becomes more obvious.
+ */
+void slabs_mlock(void) {
+    pthread_mutex_lock(&slabs_lock);
+}
+
+void slabs_munlock(void) {
+    pthread_mutex_unlock(&slabs_lock);
+}
+
 static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
 static volatile int do_run_slab_thread = 1;
 static volatile int do_run_slab_rebalance_thread = 1;
diff --git a/slabs.h b/slabs.h
@@ -40,6 +40,9 @@ void slabs_stats(ADD_STAT add_stats, void *c);
 
 /* Hints as to freespace in slab class */
 unsigned int slabs_available_chunks(unsigned int id, bool *mem_flag,
        uint64_t *total_bytes, unsigned int *chunks_perslab);
+void slabs_mlock(void);
+void slabs_munlock(void);
+
 int start_slab_maintenance_thread(void);
 void stop_slab_maintenance_thread(void);
diff --git a/t/slabs-reassign-chunked.t b/t/slabs-reassign-chunked.t
index e261577..efc87fa 100644
--- a/t/slabs-reassign-chunked.t
+++ b/t/slabs-reassign-chunked.t
@@ -133,6 +133,9 @@ for (1 .. $keycount) {
 cmp_ok($hits, '>', 4000, 'were able to fetch back 2/3rds of 8k keys');
 
 my $stats_done = mem_stats($sock);
-cmp_ok($stats_done->{slab_reassign_rescues}, '>', 0, 'some reassign rescues happened');
+cmp_ok($stats_done->{slab_reassign_chunk_rescues}, '>', 0, 'some reassign chunk rescues happened');
+# Reassign rescues won't happen here because the headers are of a different
+# size and we aren't moving pages out of that slab class
+#cmp_ok($stats_done->{slab_reassign_rescues}, '>', 0, 'some reassign rescues happened');
 cmp_ok($stats_done->{slab_reassign_evictions_nomem}, '>', 0, 'some reassign evictions happened');