summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordormando <dormando@rydia.net>2009-03-28 00:16:38 -0700
committerDustin Sallings <dustin@spy.net>2009-03-29 10:23:52 -0700
commit4ad6da605d4708bde44c24b186139c276b4020e1 (patch)
tree74fd08e4de386fc51c00c5ac4b2a9037ebba346e
parent7a5a1375cf3220f9c69a9b51ebaf56b6d7f41db4 (diff)
downloadmemcached-4ad6da605d4708bde44c24b186139c276b4020e1.tar.gz
dumb hack to self-repair stuck slabs
Since 1.2.6, most of the refcount leaks have been quashed. I still get them in production, extremely rarely. It's possible we'll have refcount leaks on and off even in the future. This hack acknowledges this and exists since we want to guarantee, as much as possible, that memcached is a stable service. Having to monitor for and restart the service on account of "rare bugs" isn't acceptable.
-rw-r--r--doc/protocol.txt3
-rw-r--r--items.c24
2 files changed, 26 insertions, 1 deletions
diff --git a/doc/protocol.txt b/doc/protocol.txt
index 6dd4cf6..6ce04eb 100644
--- a/doc/protocol.txt
+++ b/doc/protocol.txt
@@ -478,6 +478,9 @@ evicted_time Seconds since the last access for the most recent item
outofmemory Number of times the underlying slab class was unable to
store a new item. This means you are running with -M or
an eviction failed.
+tailrepairs Number of times we self-healed a slab with a refcount
+ leak. If this counter is increasing a lot, please
+ report your situation to the developers.
Note this will only display information about slabs which exist, so an empty
cache will return an empty set.
diff --git a/items.c b/items.c
index b5e3e37..2cfbae9 100644
--- a/items.c
+++ b/items.c
@@ -29,6 +29,7 @@ typedef struct {
unsigned int evicted;
rel_time_t evicted_time;
unsigned int outofmemory;
+ unsigned int tailrepairs;
} itemstats_t;
static item *heads[LARGEST_ID];
@@ -169,7 +170,26 @@ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_tim
it = slabs_alloc(ntotal, id);
if (it == 0) {
itemstats[id].outofmemory++;
- return NULL;
+ /* Last ditch effort. There is a very rare bug which causes
+ * refcount leaks. We've fixed most of them, but it still happens,
+ * and it may happen in the future.
+ * We can reasonably assume no item can stay locked for more than
+ * three hours, so if we find one in the tail which is that old,
+ * free it anyway.
+ */
+ tries = 50;
+ for (search = tails[id]; tries > 0 && search != NULL; tries--, search=search->prev) {
+ if (search->refcount != 0 && search->time + 10800 < current_time) {
+ itemstats[id].tailrepairs++;
+ search->refcount = 0;
+ do_item_unlink(search);
+ break;
+ }
+ }
+ it = slabs_alloc(ntotal, id);
+ if (it == 0) {
+ return NULL;
+ }
}
}
@@ -402,6 +422,8 @@ char *do_item_stats(uint32_t (*add_stats)(char *buf,
"%u", itemstats[i].evicted_time);
APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
"%u", itemstats[i].outofmemory);
+ APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
+ "%u", itemstats[i].tailrepairs);;
/* check whether binary protocol terminator will fit */
if (*buflen + hdrsiz > allocated) {