From 4ad6da605d4708bde44c24b186139c276b4020e1 Mon Sep 17 00:00:00 2001 From: dormando Date: Sat, 28 Mar 2009 00:16:38 -0700 Subject: dumb hack to self-repair stuck slabs since 1.2.6, most of the refcount leaks have been quashed. I still get them in production, extremely rarely. It's possible we'll have refcount leaks on and off even in the future. This hack acknowledges this and exists since we want to guarantee, as much as possible, that memcached is a stable service. Having to monitor for and restart the service on account of "rare bugs" isn't acceptable. --- doc/protocol.txt | 3 +++ items.c | 24 +++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/protocol.txt b/doc/protocol.txt index 6dd4cf6..6ce04eb 100644 --- a/doc/protocol.txt +++ b/doc/protocol.txt @@ -478,6 +478,9 @@ evicted_time Seconds since the last access for the most recent item outofmemory Number of times the underlying slab class was unable to store a new item. This means you are running with -M or an eviction failed. +tailrepairs Number of times we self-healed a slab with a refcount + leak. If this counter is increasing a lot, please + report your situation to the developers. Note this will only display information about slabs which exist, so an empty cache will return an empty set. diff --git a/items.c b/items.c index b5e3e37..2cfbae9 100644 --- a/items.c +++ b/items.c @@ -29,6 +29,7 @@ typedef struct { unsigned int evicted; rel_time_t evicted_time; unsigned int outofmemory; + unsigned int tailrepairs; } itemstats_t; static item *heads[LARGEST_ID]; @@ -169,7 +170,26 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim it = slabs_alloc(ntotal, id); if (it == 0) { itemstats[id].outofmemory++; - return NULL; + /* Last ditch effort. There is a very rare bug which causes + * refcount leaks. We've fixed most of them, but it still happens, + * and it may happen in the future.
+ * We can reasonably assume no item can stay locked for more than + * three hours, so if we find one in the tail which is that old, + * free it anyway. + */ + tries = 50; + for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) { + if (search->refcount != 0 && search->time + 10800 < current_time) { + itemstats[id].tailrepairs++; + search->refcount = 0; + do_item_unlink(search); + break; + } + } + it = slabs_alloc(ntotal, id); + if (it == 0) { + return NULL; + } } } @@ -402,6 +422,8 @@ char *do_item_stats(uint32_t (*add_stats)(char *buf, "%u", itemstats[i].evicted_time); APPEND_NUM_FMT_STAT(fmt, i, "outofmemory", "%u", itemstats[i].outofmemory); + APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs", + "%u", itemstats[i].tailrepairs);; /* check whether binary protocol terminator will fit */ if (*buflen + hdrsiz > allocated) { -- cgit v1.2.1