diff options
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | doc/protocol.txt | 4 | ||||
-rw-r--r-- | items.c | 70 | ||||
-rw-r--r-- | items.h | 8 | ||||
-rw-r--r-- | logger.c | 3 | ||||
-rw-r--r-- | logger.h | 1 | ||||
-rw-r--r-- | memcached.c | 55 | ||||
-rw-r--r-- | memcached.h | 2 | ||||
-rwxr-xr-x | scripts/memcached-automove | 2 | ||||
-rw-r--r-- | slab_automove.c | 150 | ||||
-rw-r--r-- | slab_automove.h | 8 | ||||
-rw-r--r-- | slabs.c | 16 | ||||
-rw-r--r-- | slabs.h | 7 | ||||
-rw-r--r-- | t/dyn-maxbytes.t | 2 | ||||
-rw-r--r-- | t/slabs-reassign2.t | 4 |
15 files changed, 311 insertions, 24 deletions
diff --git a/Makefile.am b/Makefile.am index 8b79525..36933ea 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,7 +22,8 @@ memcached_SOURCES = memcached.c memcached.h \ bipbuffer.c bipbuffer.h \ logger.c logger.h \ crawler.c crawler.h \ - itoa_ljust.c itoa_ljust.h + itoa_ljust.c itoa_ljust.h \ + slab_automove.c slab_automove.h if BUILD_CACHE memcached_SOURCES += cache.c diff --git a/doc/protocol.txt b/doc/protocol.txt index 2ae1c1a..d39bf2b 100644 --- a/doc/protocol.txt +++ b/doc/protocol.txt @@ -775,6 +775,10 @@ other stats command. | hashpower_init | 32 | Starting size multiplier for hash table | | slab_reassign | bool | Whether slab page reassignment is allowed | | slab_automove | bool | Whether slab page automover is enabled | +| slab_automove_ratio | +| | float | Ratio limit between young/old slab classes | +| slab_automove_window | +| | 32u | Internal algo tunable for automove | | slab_chunk_max | 32 | Max slab class size (avoid unless necessary) | | hash_algorithm | char | Hash table algorithm in use | | lru_crawler | bool | Whether the LRU crawler is enabled | @@ -1,6 +1,7 @@ /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ #include "memcached.h" #include "bipbuffer.h" +#include "slab_automove.h" #include <sys/stat.h> #include <sys/socket.h> #include <sys/resource.h> @@ -609,6 +610,33 @@ char *item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, u return buffer; } +/* With refactoring of the various stats code the automover won't need a + * custom function here. + */ +void fill_item_stats_automove(item_stats_automove *am) { + int n; + for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) { + item_stats_automove *cur = &am[n]; + + // outofmemory records into HOT + int i = n | HOT_LRU; + pthread_mutex_lock(&lru_locks[i]); + cur->outofmemory = itemstats[i].outofmemory; + pthread_mutex_unlock(&lru_locks[i]); + + // evictions and tail age are from COLD + i = n | COLD_LRU; + pthread_mutex_lock(&lru_locks[i]); + cur->evicted = itemstats[i].evicted; + if (tails[i]) { + cur->age = current_time - tails[i]->time; + } else { + cur->age = 0; + } + pthread_mutex_unlock(&lru_locks[i]); + } +} + void item_stats_totals(ADD_STAT add_stats, void *c) { itemstats_t totals; memset(&totals, 0, sizeof(itemstats_t)); @@ -1293,12 +1321,11 @@ static uint64_t lru_total_bumps_dropped(void) { static int lru_maintainer_juggle(const int slabs_clsid) { int i; int did_moves = 0; - bool mem_limit_reached = false; uint64_t total_bytes = 0; unsigned int chunks_perslab = 0; - unsigned int chunks_free = 0; + //unsigned int chunks_free = 0; /* TODO: if free_chunks below high watermark, increase aggressiveness */ - chunks_free = slabs_available_chunks(slabs_clsid, &mem_limit_reached, + slabs_available_chunks(slabs_clsid, NULL, &total_bytes, &chunks_perslab); if (settings.temp_lru) { /* Only looking for reclaims. Run before we size the LRU. */ @@ -1312,14 +1339,6 @@ static int lru_maintainer_juggle(const int slabs_clsid) { total_bytes -= temp_lru_size(slabs_clsid); } - /* If slab automove is enabled on any level, and we have more than 2 pages - * worth of chunks free in this class, ask (gently) to reassign a page - * from this class back into the global pool (0) - */ - if (settings.slab_automove > 0 && chunks_free > (chunks_perslab * 2.5)) { - slabs_reassign(slabs_clsid, SLAB_GLOBAL_PAGE_POOL); - } - rel_time_t cold_age = 0; rel_time_t hot_age = 0; /* If LRU is in flat mode, force items to drain into COLD via max age */ @@ -1467,6 +1486,7 @@ static void *lru_maintainer_thread(void *arg) { useconds_t to_sleep = MIN_LRU_MAINTAINER_SLEEP; useconds_t last_sleep = MIN_LRU_MAINTAINER_SLEEP; rel_time_t last_crawler_check = 0; + rel_time_t last_automove_check = 0; useconds_t next_juggles[MAX_NUMBER_OF_SLAB_CLASSES]; useconds_t backoff_juggles[MAX_NUMBER_OF_SLAB_CLASSES]; struct crawler_expired_data cdata; @@ -1480,6 +1500,10 @@ static void *lru_maintainer_thread(void *arg) { abort(); } + double last_ratio = settings.slab_automove_ratio; + void *am = slab_automove_init(settings.slab_automove_window, + settings.slab_automove_ratio); + pthread_mutex_lock(&lru_maintainer_lock); if (settings.verbose > 2) fprintf(stderr, "Starting LRU maintainer background thread\n"); @@ -1537,8 +1561,32 @@ static void *lru_maintainer_thread(void *arg) { lru_maintainer_crawler_check(&cdata, l); last_crawler_check = current_time; } + + if (settings.slab_automove == 1 && last_automove_check != current_time) { + if (last_ratio != settings.slab_automove_ratio) { + slab_automove_free(am); + am = slab_automove_init(settings.slab_automove_window, + settings.slab_automove_ratio); + last_ratio = settings.slab_automove_ratio; + } + int src, dst; + slab_automove_run(am, &src, &dst); + if (src != -1 && dst != -1) { + slabs_reassign(src, dst); + LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL, + src, dst); + } + // dst == 0 means reclaim to global pool, be more aggressive + if (dst != 0) { + last_automove_check = current_time; + } else if (dst == 0) { + // also ensure we minimize the thread sleep + to_sleep = 1000; + } + } } pthread_mutex_unlock(&lru_maintainer_lock); + slab_automove_free(am); if (settings.verbose > 2) fprintf(stderr, "LRU maintainer thread stopping\n"); @@ -46,6 +46,14 @@ void item_stats_sizes_add(item *it); void item_stats_sizes_remove(item *it); bool item_stats_sizes_status(void); +/* stats getter for slab automover */ +typedef struct { + int64_t evicted; + int64_t outofmemory; + uint32_t age; +} item_stats_automove; +void fill_item_stats_automove(item_stats_automove *am); + item *do_item_get(const char *key, const size_t nkey, const uint32_t hv, conn *c, const bool do_update); item *do_item_touch(const char *key, const size_t nkey, uint32_t exptime, const uint32_t hv, conn *c); void item_stats_reset(void); @@ -51,6 +51,9 @@ static const entry_details default_entries[] = { [LOGGER_ITEM_STORE] = {LOGGER_ITEM_STORE_ENTRY, 512, LOG_MUTATIONS, NULL}, [LOGGER_CRAWLER_STATUS] = {LOGGER_TEXT_ENTRY, 512, LOG_SYSEVENTS, "type=lru_crawler crawler=%d lru=%s low_mark=%llu next_reclaims=%llu since_run=%u next_run=%d elapsed=%u examined=%llu reclaimed=%llu" + }, + [LOGGER_SLAB_MOVE] = {LOGGER_TEXT_ENTRY, 512, LOG_SYSEVENTS, + "type=slab_move src=%d dst=%d" } }; @@ -19,6 +19,7 @@ enum log_entry_type { LOGGER_ITEM_GET, LOGGER_ITEM_STORE, LOGGER_CRAWLER_STATUS, + LOGGER_SLAB_MOVE, }; enum log_entry_subtype { diff --git a/memcached.c b/memcached.c index c004c73..06cacb4 100644 --- a/memcached.c +++ b/memcached.c @@ -245,6 +245,8 @@ static void settings_init(void) { settings.hashpower_init = 0; settings.slab_reassign = false; settings.slab_automove = 0; + settings.slab_automove_ratio = 0.8; + settings.slab_automove_window = 30; settings.shutdown_command = false; settings.tail_repair_time = TAIL_REPAIR_TIME_DEFAULT; settings.flush_enabled = true; @@ -3017,6 +3019,8 @@ static void process_stat_settings(ADD_STAT add_stats, void *c) { APPEND_STAT("hashpower_init", "%d", settings.hashpower_init); APPEND_STAT("slab_reassign", "%s", settings.slab_reassign ? "yes" : "no"); APPEND_STAT("slab_automove", "%d", settings.slab_automove); + APPEND_STAT("slab_automove_ratio", "%.2f", settings.slab_automove_ratio); + APPEND_STAT("slab_automove_window", "%.2f", settings.slab_automove_window); APPEND_STAT("slab_chunk_max", "%d", settings.slab_chunk_size_max); APPEND_STAT("lru_crawler", "%s", settings.lru_crawler ? "yes" : "no"); APPEND_STAT("lru_crawler_sleep", "%d", settings.lru_crawler_sleep); @@ -3822,19 +3826,28 @@ static void process_verbosity_command(conn *c, token_t *tokens, const size_t nto static void process_slabs_automove_command(conn *c, token_t *tokens, const size_t ntokens) { unsigned int level; + double ratio; assert(c != NULL); set_noreply_maybe(c, tokens, ntokens); - level = strtoul(tokens[2].value, NULL, 10); - if (level == 0) { - settings.slab_automove = 0; - } else if (level == 1 || level == 2) { - settings.slab_automove = level; + if (strcmp(tokens[2].value, "ratio") == 0) { + if (ntokens < 5 || !safe_strtod(tokens[3].value, &ratio)) { + out_string(c, "ERROR"); + return; + } + settings.slab_automove_ratio = ratio; } else { - out_string(c, "ERROR"); - return; + level = strtoul(tokens[2].value, NULL, 10); + if (level == 0) { + settings.slab_automove = 0; + } else if (level == 1 || level == 2) { + settings.slab_automove = level; + } else { + out_string(c, "ERROR"); + return; + } } out_string(c, "OK"); return; @@ -4135,7 +4148,7 @@ static void process_command(conn *c, char *command) { break; } return; - } else if (ntokens == 4 && + } else if (ntokens >= 4 && (strcmp(tokens[COMMAND_TOKEN + 1].value, "automove") == 0)) { process_slabs_automove_command(c, tokens, ntokens); } else { @@ -5872,6 +5885,8 @@ int main (int argc, char **argv) { HASHPOWER_INIT, SLAB_REASSIGN, SLAB_AUTOMOVE, + SLAB_AUTOMOVE_RATIO, + SLAB_AUTOMOVE_WINDOW, TAIL_REPAIR_TIME, HASH_ALGORITHM, LRU_CRAWLER, @@ -5897,6 +5912,8 @@ int main (int argc, char **argv) { [HASHPOWER_INIT] = "hashpower", [SLAB_REASSIGN] = "slab_reassign", [SLAB_AUTOMOVE] = "slab_automove", + [SLAB_AUTOMOVE_RATIO] = "slab_automove_ratio", + [SLAB_AUTOMOVE_WINDOW] = "slab_automove_window", [TAIL_REPAIR_TIME] = "tail_repair_time", [HASH_ALGORITHM] = "hash_algorithm", [LRU_CRAWLER] = "lru_crawler", @@ -6213,6 +6230,28 @@ int main (int argc, char **argv) { return 1; } break; + case SLAB_AUTOMOVE_RATIO: + if (subopts_value == NULL) { + fprintf(stderr, "Missing slab_automove_ratio argument\n"); + return 1; + } + settings.slab_automove_ratio = atof(subopts_value); + if (settings.slab_automove_ratio <= 0 || settings.slab_automove_ratio > 1) { + fprintf(stderr, "slab_automove_ratio must be > 0 and < 1\n"); + return 1; + } + break; + case SLAB_AUTOMOVE_WINDOW: + if (subopts_value == NULL) { + fprintf(stderr, "Missing slab_automove_window argument\n"); + return 1; + } + settings.slab_automove_window = atoi(subopts_value); + if (settings.slab_automove_window < 3) { + fprintf(stderr, "slab_automove_window must be > 2\n"); + return 1; + } + break; case TAIL_REPAIR_TIME: if (subopts_value == NULL) { fprintf(stderr, "Missing numeric argument for tail_repair_time\n"); diff --git a/memcached.h b/memcached.h index 6b54fa0..bb4b4ca 100644 --- a/memcached.h +++ b/memcached.h @@ -360,6 +360,8 @@ struct settings { bool lru_segmented; /* Use split or flat LRU's */ bool slab_reassign; /* Whether or not slab reassignment is allowed */ int slab_automove; /* Whether or not to automatically move slabs */ + double slab_automove_ratio; /* youngest must be within pct of oldest */ + unsigned int slab_automove_window; /* window mover for algorithm */ int hashpower_init; /* Starting hash power level */ bool shutdown_command; /* allow shutdown command */ int tail_repair_time; /* LRU tail refcount leak repair time */ diff --git a/scripts/memcached-automove b/scripts/memcached-automove index 6b4011b..f881129 100755 --- a/scripts/memcached-automove +++ b/scripts/memcached-automove @@ -87,7 +87,7 @@ def determine_move(history, diffs, totals): break # are we the oldest slab class? (and a valid target) - if age > oldest[1] and slab['total_pages'] > 5: + if age > oldest[1] and slab['total_pages'] > 2: oldest = (sid, age) # are we the youngest evicting slab class? diff --git a/slab_automove.c b/slab_automove.c new file mode 100644 index 0000000..8a66370 --- /dev/null +++ b/slab_automove.c @@ -0,0 +1,150 @@ +/* Copyright 2017 Facebook. + * + * Use and distribution licensed under the BSD license. See + * the LICENSE file for full text. + */ + +/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +#include "memcached.h" +#include "slab_automove.h" +#include <stdlib.h> +#include <string.h> + +#define MIN_PAGES_FOR_SOURCE 2 +#define MIN_PAGES_FOR_RECLAIM 2.5 + +struct window_data { + uint64_t age; + uint64_t dirty; + uint64_t evicted; +}; + +typedef struct { + struct window_data *window_data; + uint32_t window_size; + uint32_t window_cur; + double max_age_ratio; + item_stats_automove iam_before[MAX_NUMBER_OF_SLAB_CLASSES]; + item_stats_automove iam_after[MAX_NUMBER_OF_SLAB_CLASSES]; + slab_stats_automove sam_before[MAX_NUMBER_OF_SLAB_CLASSES]; + slab_stats_automove sam_after[MAX_NUMBER_OF_SLAB_CLASSES]; +} slab_automove; + +void *slab_automove_init(uint32_t window_size, double max_age_ratio) { + slab_automove *a = calloc(1, sizeof(slab_automove)); + if (a == NULL) + return NULL; + a->window_data = calloc(window_size * MAX_NUMBER_OF_SLAB_CLASSES, sizeof(struct window_data)); + a->window_size = window_size; + a->max_age_ratio = max_age_ratio; + if (a->window_data == NULL) { + free(a); + return NULL; + } + + // do a dry run to fill the before structs + fill_item_stats_automove(a->iam_before); + fill_slab_stats_automove(a->sam_before); + + return (void *)a; +} + +void slab_automove_free(void *arg) { + slab_automove *a = (slab_automove *)arg; + free(a->window_data); + free(a); +} + +static void window_sum(struct window_data *wd, struct window_data *w, uint32_t size) { + int x; + for (x = 0; x < size; x++) { + struct window_data *d = &wd[x]; + w->age += d->age; + w->dirty += d->dirty; + w->evicted += d->evicted; + } +} + +// TODO: if oldest is dirty, find next oldest. +// still need to base ratio off of absolute age +void slab_automove_run(void *arg, int *src, int *dst) { + slab_automove *a = (slab_automove *)arg; + int n; + struct window_data w_sum; + int oldest = -1; + uint64_t oldest_age = 0; + int youngest = -1; + uint64_t youngest_age = ~0; + bool youngest_evicting = false; + *src = -1; + *dst = -1; + + // fill after structs + fill_item_stats_automove(a->iam_after); + fill_slab_stats_automove(a->sam_after); + a->window_cur++; + + // iterate slabs + for (n = POWER_SMALLEST; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) { + int w_offset = n * a->window_size; + struct window_data *wd = &a->window_data[w_offset + (a->window_cur % a->window_size)]; + memset(wd, 0, sizeof(struct window_data)); + // summarize the window-up-to-now. + memset(&w_sum, 0, sizeof(struct window_data)); + window_sum(&a->window_data[w_offset], &w_sum, a->window_size); + + // if page delta, or evicted delta, mark window dirty + // (or outofmemory) + if (a->iam_after[n].evicted - a->iam_before[n].evicted > 0 || + a->iam_after[n].outofmemory - a->iam_before[n].outofmemory > 0) { + wd->evicted = 1; + wd->dirty = 1; + } + if (a->sam_after[n].total_pages - a->sam_before[n].total_pages > 0) { + wd->dirty = 1; + } + + // set age into window + wd->age = a->iam_after[n].age; + + // grab age as average of window total + uint64_t age = w_sum.age / a->window_size; + + // if > N free chunks and not dirty, make decision. + if (a->sam_after[n].free_chunks > a->sam_after[n].chunks_per_page * MIN_PAGES_FOR_RECLAIM) { + if (w_sum.dirty == 0) { + *src = n; + *dst = 0; + break; + } + } + + // if oldest and have enough pages, is oldest + if (age > oldest_age && a->sam_after[n].total_pages > MIN_PAGES_FOR_SOURCE) { + oldest = n; + oldest_age = age; + } + + // grab evicted count from window + // if > half the window and youngest, mark as youngest + if (age < youngest_age && w_sum.evicted > a->window_size / 2) { + youngest = n; + youngest_age = age; + youngest_evicting = wd->evicted ? true : false; + } + } + + memcpy(a->iam_before, a->iam_after, + sizeof(item_stats_automove) * MAX_NUMBER_OF_SLAB_CLASSES); + memcpy(a->sam_before, a->sam_after, + sizeof(slab_stats_automove) * MAX_NUMBER_OF_SLAB_CLASSES); + // if we have a youngest and oldest, and oldest is outside the ratio, + // also, only make decisions if window has filled once. + if (youngest != -1 && oldest != -1 && a->window_cur > a->window_size) { + if (youngest_age < ((double)oldest_age * a->max_age_ratio) && youngest_evicting) { + *src = oldest; + *dst = youngest; + } + } + return; +} diff --git a/slab_automove.h b/slab_automove.h new file mode 100644 index 0000000..99d8d0c --- /dev/null +++ b/slab_automove.h @@ -0,0 +1,8 @@ +#ifndef SLAB_AUTOMOVE_H +#define SLAB_AUTOMOVE_H + +void *slab_automove_init(uint32_t window_size, double max_age_ratio); +void slab_automove_free(void *arg); +void slab_automove_run(void *arg, int *src, int *dst); + +#endif @@ -375,6 +375,22 @@ static void do_slabs_free(void *ptr, const size_t size, unsigned int id) { return; } +/* With refactoring of the various stats code the automover won't need a + * custom function here. + */ +void fill_slab_stats_automove(slab_stats_automove *am) { + int n; + pthread_mutex_lock(&slabs_lock); + for (n = 0; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) { + slabclass_t *p = &slabclass[n]; + slab_stats_automove *cur = &am[n]; + cur->chunks_per_page = p->perslab; + cur->free_chunks = p->sl_curr; + cur->total_pages = p->slabs; + } + pthread_mutex_unlock(&slabs_lock); +} + static int nz_strcmp(int nzlength, const char *nz, const char *z) { int zlength=strlen(z); return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1; @@ -34,6 +34,13 @@ bool slabs_adjust_mem_limit(size_t new_mem_limit); /** Return a datum for stats in binary protocol */ bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c); +typedef struct { + unsigned int chunks_per_page; + long int free_chunks; + long int total_pages; +} slab_stats_automove; +void fill_slab_stats_automove(slab_stats_automove *am); + /** Fill buffer with stats */ /*@null@*/ void slabs_stats(ADD_STAT add_stats, void *c); diff --git a/t/dyn-maxbytes.t b/t/dyn-maxbytes.t index 23821db..85809ae 100644 --- a/t/dyn-maxbytes.t +++ b/t/dyn-maxbytes.t @@ -7,7 +7,7 @@ use FindBin qw($Bin); use lib "$Bin/lib"; use MemcachedTest; -my $server = new_memcached("-m 3 -o modern"); +my $server = new_memcached("-m 3 -o modern,slab_automove_window=3"); my $sock = $server->sock; my $value = "B"x66560; my $key = 0; diff --git a/t/slabs-reassign2.t b/t/slabs-reassign2.t index d7be004..6762a8f 100644 --- a/t/slabs-reassign2.t +++ b/t/slabs-reassign2.t @@ -8,7 +8,7 @@ use lib "$Bin/lib"; use MemcachedTest; use Data::Dumper qw/Dumper/; -my $server = new_memcached('-m 60 -o slab_reassign,slab_automove,lru_crawler,lru_maintainer'); +my $server = new_memcached('-m 60 -o slab_reassign,slab_automove,lru_crawler,lru_maintainer,slab_automove_window=3'); my $sock = $server->sock; my $value = "B"x11000; @@ -40,7 +40,7 @@ for (1 .. $todelete) { for ($tries = 20; $tries > 0; $tries--) { sleep 1; my $stats = mem_stats($sock); - if ($stats->{slab_global_page_pool} > 0) { + if ($stats->{slab_global_page_pool} > 24) { last; } } |