author    dormando <dormando@rydia.net>  2022-08-18 23:03:54 -0700
committer dormando <dormando@rydia.net>  2022-08-25 20:43:32 -0700
commit    3d6d74a340c3eb27777f96937bead823b4901753 (patch)
tree      310cde0318f3e6afb8fa4e36704ab3a5bcb4556d
parent    a102df4554e18c8733331f61a1285dd6f6ff4d39 (diff)
download  memcached-3d6d74a340c3eb27777f96937bead823b4901753.tar.gz
extstore: make defaults more aggressive
extstore has a background thread which examines slab classes for items to flush to disk. The thresholds for flushing to disk are managed by a specialized "slab automove" algorithm. This algorithm was written in 2017 and not tuned since.

Most serious users set "ext_item_age=0" and force flush all items. This is partially because the defaults do not flush aggressively enough, which causes memory to run out and evictions to happen.

This change simplifies the slab automove portion. Instead of balancing free chunks of memory per slab class, it sets a target of a certain number of free global pages. The extstore flusher thread also uses the page pool and some low chunk limits to decide when to start flushing. Its sleep routines have also been adjusted as it could oversleep too easily.

A few other small changes were required to avoid over-moving slab pages around.
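The new heuristic fits in a few lines of C. Below is a minimal, self-contained sketch; the helper names (pool_watermark, effective_item_age) are hypothetical stand-ins, but the arithmetic mirrors what memcheck() in slab_automove_extstore.c and storage_write_thread() in storage.c now do:

#define MIN_PAGES_FREE 3 /* same constant the patch adds to storage.c */

/* Watermark: hold free_ratio of all allocated pages in the global
 * pool, but never fewer than two pages. memcheck() publishes this
 * value via settings.ext_global_pool_min. */
static unsigned int pool_watermark(unsigned int total_pages, double free_ratio) {
    unsigned int wm = (unsigned int)(total_pages * free_ratio);
    return wm < 2 ? 2 : wm;
}

/* Flush regardless of age (item_age = 0) only when a slab class is
 * low on free chunks AND the global pool is at or below the
 * watermark; otherwise honor the configured ext_item_age. */
static int effective_item_age(unsigned int chunks_free,
                              unsigned int chunks_per_page,
                              unsigned int global_pages,
                              unsigned int watermark,
                              int configured_age) {
    if (chunks_free < chunks_per_page * MIN_PAGES_FREE
            && global_pages <= watermark) {
        return 0;
    }
    return configured_age;
}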
-rw-r--r--  items.c                  |   8
-rw-r--r--  memcached.h              |   4
-rw-r--r--  proto_text.c             |   4
-rw-r--r--  slab_automove_extstore.c | 113
-rw-r--r--  storage.c                |  20
5 files changed, 36 insertions(+), 113 deletions(-)
diff --git a/items.c b/items.c
index e5347cc..9cee07e 100644
--- a/items.c
+++ b/items.c
@@ -1669,13 +1669,7 @@ static void *lru_maintainer_thread(void *arg) {
LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_SLAB_MOVE, NULL,
src, dst);
}
- // dst == 0 means reclaim to global pool, be more aggressive
- if (dst != 0) {
- last_automove_check = current_time;
- } else if (dst == 0) {
- // also ensure we minimize the thread sleep
- to_sleep = 1000;
- }
+ last_automove_check = current_time;
}
}
pthread_mutex_unlock(&lru_maintainer_lock);
diff --git a/memcached.h b/memcached.h
index ba495d3..5f517f2 100644
--- a/memcached.h
+++ b/memcached.h
@@ -507,8 +507,8 @@ struct settings {
double ext_max_frag; /* ideal maximum page fragmentation */
double slab_automove_freeratio; /* % of memory to hold free as buffer */
bool ext_drop_unread; /* skip unread items during compaction */
- /* per-slab-class free chunk limit */
- unsigned int ext_free_memchunks[MAX_NUMBER_OF_SLAB_CLASSES];
+ /* start flushing to extstore after memory below this */
+ unsigned int ext_global_pool_min;
#endif
#ifdef TLS
bool ssl_enabled; /* indicates whether SSL is enabled */
diff --git a/proto_text.c b/proto_text.c
index d794514..6885786 100644
--- a/proto_text.c
+++ b/proto_text.c
@@ -2410,7 +2410,7 @@ static void process_extstore_command(conn *c, token_t *tokens, const size_t ntok
if (ntokens < 4) {
ok = false;
} else if (strcmp(tokens[1].value, "free_memchunks") == 0 && ntokens > 4) {
- /* per-slab-class free chunk setting. */
+ // setting is deprecated and ignored, but accepted for backcompat
unsigned int clsid = 0;
unsigned int limit = 0;
if (!safe_strtoul(tokens[2].value, &clsid) ||
@@ -2418,7 +2418,7 @@ static void process_extstore_command(conn *c, token_t *tokens, const size_t ntok
ok = false;
} else {
if (clsid < MAX_NUMBER_OF_SLAB_CLASSES) {
- settings.ext_free_memchunks[clsid] = limit;
+ ok = true;
} else {
ok = false;
}
diff --git a/slab_automove_extstore.c b/slab_automove_extstore.c
index 3831279..1020983 100644
--- a/slab_automove_extstore.c
+++ b/slab_automove_extstore.c
@@ -13,7 +13,6 @@
#define MIN_PAGES_FOR_SOURCE 2
#define MIN_PAGES_FOR_RECLAIM 2.5
#define MIN_PAGES_FREE 1.5
-#define MEMCHECK_PERIOD 60
struct window_data {
uint64_t age;
@@ -23,23 +22,16 @@ struct window_data {
unsigned int relaxed;
};
-struct window_global {
- uint32_t pool_low;
- uint32_t pool_high;
-};
-
typedef struct {
struct window_data *window_data;
- struct window_global *window_global;
struct settings *settings;
uint32_t window_size;
uint32_t window_cur;
uint32_t item_size;
- rel_time_t last_memcheck_run;
double max_age_ratio;
double free_ratio;
bool pool_filled_once;
- unsigned int free_mem[MAX_NUMBER_OF_SLAB_CLASSES];
+ unsigned int global_pool_watermark;
item_stats_automove iam_before[MAX_NUMBER_OF_SLAB_CLASSES];
item_stats_automove iam_after[MAX_NUMBER_OF_SLAB_CLASSES];
slab_stats_automove sam_before[MAX_NUMBER_OF_SLAB_CLASSES];
@@ -53,19 +45,15 @@ void *slab_automove_extstore_init(struct settings *settings) {
if (a == NULL)
return NULL;
a->window_data = calloc(window_size * MAX_NUMBER_OF_SLAB_CLASSES, sizeof(struct window_data));
- a->window_global = calloc(window_size, sizeof(struct window_global));
a->window_size = window_size;
a->max_age_ratio = max_age_ratio;
a->free_ratio = settings->slab_automove_freeratio;
a->item_size = settings->ext_item_size;
- a->last_memcheck_run = 0;
a->settings = settings;
a->pool_filled_once = false;
- if (a->window_data == NULL || a->window_global == NULL) {
+ if (a->window_data == NULL) {
if (a->window_data)
free(a->window_data);
- if (a->window_global)
- free(a->window_global);
free(a);
return NULL;
}
@@ -80,7 +68,6 @@ void *slab_automove_extstore_init(struct settings *settings) {
void slab_automove_extstore_free(void *arg) {
slab_automove *a = (slab_automove *)arg;
free(a->window_data);
- free(a->window_global);
free(a);
}
@@ -96,32 +83,19 @@ static void window_sum(struct window_data *wd, struct window_data *w,
}
}
-/* This could potentially merge with above */
-static void window_global_sum(struct window_global *wg,
- struct window_global *w, uint32_t size) {
- for (int x = 0; x < size; x++) {
- struct window_global *d = &wg[x];
- w->pool_high += d->pool_high;
- w->pool_low += d->pool_low;
- }
-}
-
-static void global_pool_check(slab_automove *a) {
+static int global_pool_check(slab_automove *a) {
bool mem_limit_reached;
- uint32_t free = a->free_mem[0];
- struct window_global *wg = &a->window_global[a->window_cur % a->window_size];
+ unsigned int free = a->global_pool_watermark;
unsigned int count = global_page_pool_size(&mem_limit_reached);
- memset(wg, 0, sizeof(struct window_global));
if (!mem_limit_reached)
- return;
- if (count < free / 2) {
- wg->pool_low = 1;
+ return 0;
+ if (count < free) {
a->pool_filled_once = true;
- } else if (count > free) {
- wg->pool_high = 1;
+ return 1;
} else {
a->pool_filled_once = true;
}
+ return 0;
}
/* A percentage of memory is configured to be held "free" as buffers for the
@@ -135,24 +109,20 @@ static void global_pool_check(slab_automove *a) {
*/
static void memcheck(slab_automove *a) {
unsigned int total_pages = 0;
- if (current_time < a->last_memcheck_run + MEMCHECK_PERIOD)
- return;
- a->last_memcheck_run = current_time;
+
+ // FIXME: is there a cached counter for total pages alloced?
+ // technically we only really need to do this once as the pages are
+ // prefilled and ratio isn't a runtime change.
for (int n = 1; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
slab_stats_automove *sam = &a->sam_after[n];
total_pages += sam->total_pages;
- unsigned int hold_free = (sam->total_pages * sam->chunks_per_page)
- * a->free_ratio;
- if (sam->chunks_per_page * MIN_PAGES_FREE > hold_free)
- hold_free = sam->chunks_per_page * MIN_PAGES_FREE;
- a->free_mem[n] = hold_free;
- if (a->settings->ext_free_memchunks[n] != hold_free && a->pool_filled_once) {
- a->settings->ext_free_memchunks[n] = hold_free;
- }
}
- // remember to add what remains in global pool.
+ // always update what remains in the global page pool
total_pages += a->sam_after[0].total_pages;
- a->free_mem[0] = total_pages * a->free_ratio;
+ a->global_pool_watermark = total_pages * a->free_ratio;
+ if (a->global_pool_watermark < 2)
+ a->global_pool_watermark = 2;
+ settings.ext_global_pool_min = a->global_pool_watermark;
}
static struct window_data *get_window_data(slab_automove *a, int class) {
@@ -166,16 +136,11 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
struct window_data w_sum;
int oldest = -1;
uint64_t oldest_age = 0;
- int youngest = -1;
- uint64_t youngest_age = ~0;
bool too_free = false;
*src = -1;
*dst = -1;
- global_pool_check(a);
- struct window_global wg_sum;
- memset(&wg_sum, 0, sizeof(struct window_global));
- window_global_sum(a->window_global, &wg_sum, a->window_size);
+ int global_low = global_pool_check(a);
// fill after structs
fill_item_stats_automove(a->iam_after);
fill_slab_stats_automove(a->sam_after);
@@ -187,13 +152,13 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
for (n = POWER_SMALLEST; n < MAX_NUMBER_OF_SLAB_CLASSES; n++) {
bool small_slab = a->sam_before[n].chunk_size < a->item_size
? true : false;
- bool free_enough = false;
struct window_data *wd = get_window_data(a, n);
// summarize the window-up-to-now.
memset(&w_sum, 0, sizeof(struct window_data));
int w_offset = n * a->window_size;
window_sum(&a->window_data[w_offset], &w_sum, a->window_size);
memset(wd, 0, sizeof(struct window_data));
+ unsigned int free_target = a->sam_after[n].chunks_per_page * MIN_PAGES_FREE;
// if page delta, oom, or evicted delta, mark window dirty
// classes marked dirty cannot donate memory back to global pool.
@@ -205,15 +170,9 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
if (a->sam_after[n].total_pages - a->sam_before[n].total_pages > 0) {
wd->dirty = 1;
}
- // Mark excess free if we're over the free mem limit for too long.
- // "free_enough" means it is either wobbling, recently received a new
- // page of memory, or the crawler is freeing memory.
- if (a->sam_after[n].free_chunks > a->free_mem[n]) {
- free_enough = true;
- }
// double the free requirements means we may have memory we can
// reclaim to global, if it stays this way for the whole window.
- if (a->sam_after[n].free_chunks > (a->free_mem[n] * 2) && a->free_mem[n] > 0) {
+ if (a->sam_after[n].free_chunks > (free_target * 2)) {
wd->excess_free = 1;
}
@@ -249,14 +208,6 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
oldest_age = age;
}
- // don't count as youngest if it hasn't been using new chunks.
- // (if it was relaxed recently, and is currently "free enough")
- if (age < youngest_age && a->sam_after[n].total_pages != 0
- && w_sum.excess_free < a->window_size
- && !(w_sum.relaxed && free_enough)) {
- youngest = n;
- youngest_age = age;
- }
}
}
@@ -268,31 +219,9 @@ void slab_automove_extstore_run(void *arg, int *src, int *dst) {
if (a->window_cur < a->window_size)
return;
- if (wg_sum.pool_high >= a->window_size && !wg_sum.pool_low && youngest != -1) {
- if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
- *src = 0;
- *dst = youngest;
- }
- struct window_data *wd = get_window_data(a, youngest);
- // "relaxing" here and below allows us to skip classes which will
- // never grow or are growing slowly, more quickly finding other
- // classes which violate the age ratio.
- wd->relaxed = 1;
- } else if (!too_free && wg_sum.pool_low && oldest != -1) {
+ if (!too_free && global_low && oldest != -1) {
*src = oldest;
*dst = 0;
- } else if (!too_free && youngest != -1 && oldest != -1 && youngest != oldest) {
- // if we have a youngest and oldest, and oldest is outside the ratio.
- if (youngest_age < ((double)oldest_age * a->max_age_ratio)) {
- struct window_data *wd = get_window_data(a, youngest);
- wd->relaxed = 1;
- // only actually assign more memory if it's absorbed what it has
- if (a->sam_after[youngest].free_chunks <= a->free_mem[youngest]) {
- *src = 0;
- *dst = youngest;
-
- }
- }
}
return;
}
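Taken together, the slab_automove_extstore.c changes reduce the mover's job to one rule: once memory has filled at least once, reclaim a page from the oldest slab class whenever the global pool drops below the watermark. A sketch of that decision, with hypothetical parameters standing in for the values the real code pulls from its stats windows (pool_pages from global_page_pool_size(), oldest and too_free from the per-class loop):

#include <stdbool.h>

/* Simplified automove decision: *src/*dst of -1 means "do nothing";
 * src = oldest, dst = 0 moves a page from the oldest class back to
 * the global pool. */
static void automove_decide(unsigned int pool_pages, unsigned int watermark,
                            bool mem_limit_reached, int oldest,
                            bool too_free, int *src, int *dst) {
    *src = -1;
    *dst = -1;
    /* Until all memory is allocated there is nothing to balance. */
    if (!mem_limit_reached)
        return;
    /* Pool below watermark: feed it from the oldest class, unless a
     * class already has excess free pages queued for reclaim. */
    if (!too_free && pool_pages < watermark && oldest != -1) {
        *src = oldest;
        *dst = 0;
    }
}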
diff --git a/storage.c b/storage.c
index 890437e..f82d96b 100644
--- a/storage.c
+++ b/storage.c
@@ -572,6 +572,7 @@ static int storage_write(void *storage, const int clsid, const int item_age) {
static pthread_t storage_write_tid;
static pthread_mutex_t storage_write_plock;
#define WRITE_SLEEP_MIN 500
+#define MIN_PAGES_FREE 3
static void *storage_write_thread(void *arg) {
void *storage = arg;
@@ -591,6 +592,7 @@ static void *storage_write_thread(void *arg) {
while (1) {
// cache per-loop to avoid calls to the slabs_clsid() search loop
int min_class = slabs_clsid(settings.ext_item_size);
+ unsigned int global_pages = global_page_pool_size(NULL);
bool do_sleep = true;
counter++;
if (to_sleep > settings.ext_max_sleep)
@@ -601,7 +603,6 @@ static void *storage_write_thread(void *arg) {
bool mem_limit_reached = false;
unsigned int chunks_free;
int item_age;
- int target = settings.ext_free_memchunks[x];
if (min_class > x || (backoff[x] && (counter % backoff[x] != 0))) {
// Long sleeps means we should retry classes sooner.
if (to_sleep > WRITE_SLEEP_MIN * 10)
@@ -610,13 +611,15 @@ static void *storage_write_thread(void *arg) {
}
// Avoid extra slab lock calls during heavy writing.
+ unsigned int chunks_perpage = 0;
chunks_free = slabs_available_chunks(x, &mem_limit_reached,
- NULL);
+ &chunks_perpage);
+ unsigned int target = chunks_perpage * MIN_PAGES_FREE;
// storage_write() will fail and cut loop after filling write buffer.
while (1) {
// if we are low on chunks and no spare, push out early.
- if (chunks_free < target && mem_limit_reached) {
+ if (chunks_free < target && global_pages <= settings.ext_global_pool_min) {
item_age = 0;
} else {
item_age = settings.ext_item_age;
@@ -624,7 +627,6 @@ static void *storage_write_thread(void *arg) {
if (storage_write(storage, x, item_age)) {
chunks_free++; // Allow stopping if we've done enough this loop
did_move = true;
- do_sleep = false;
if (to_sleep > WRITE_SLEEP_MIN)
to_sleep /= 2;
} else {
@@ -635,7 +637,7 @@ static void *storage_write_thread(void *arg) {
if (!did_move) {
backoff[x]++;
} else if (backoff[x]) {
- backoff[x] /= 2;
+ backoff[x] = 1;
}
}
@@ -643,7 +645,7 @@ static void *storage_write_thread(void *arg) {
pthread_mutex_unlock(&storage_write_plock);
if (do_sleep) {
usleep(to_sleep);
- to_sleep *= 2;
+ to_sleep++;
}
pthread_mutex_lock(&storage_write_plock);
}
@@ -1379,10 +1381,8 @@ void *storage_init(void *conf) {
settings.ext_drop_under = cf->storage_file->page_count / 4;
}
crc32c_init();
- /* Init free chunks to zero. */
- for (int x = 0; x < MAX_NUMBER_OF_SLAB_CLASSES; x++) {
- settings.ext_free_memchunks[x] = 0;
- }
+
+ settings.ext_global_pool_min = 0;
storage = extstore_init(cf->storage_file, ext_cf, &eres);
if (storage == NULL) {
fprintf(stderr, "Failed to initialize external storage: %s\n",