diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2016-09-08 02:27:14 -0400 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-09-08 16:27:14 +1000 |
commit | 5f07ab685f2b1e978cfda354dd52a5fc52f8f8ab (patch) | |
tree | a3f211c6b2e6cf48637cf04b85958ba72efc409a /src | |
parent | ac66d28f02f6ede3ed5f3d0eaf273a47ee4c2147 (diff) | |
download | mongo-5f07ab685f2b1e978cfda354dd52a5fc52f8f8ab.tar.gz |
WT-2897 Checkpoints can become corrupted on failure (#3027)
During a checkpoint, the extent lists of previous checkpoints that are being
discarded are merged into the live system's extent lists. If there's an error
(most likely disk-full causing a write to fail), the live system's allocation
list can be left corrupted. Future checkpoints are not locked out by this
failure, which means a subsequent checkpoint (for example, the sweep server
closing the file) could potentially create a checkpoint based on corrupted
extent lists.
After the live system's extent lists have been modified during a checkpoint,
change errors to switch the handle into read-only mode and panic the system.
Second, the checks for re-entering the checkpoint code don't prevent bugs in
the upper-level code from affecting checkpoints, and I'm suspicious that is
happening. Adjust the in-checkpoint tests and use the live system's lock to
ensure the checkpoint code is never re-entered.
Diffstat (limited to 'src')
-rw-r--r-- | src/block/block_ckpt.c | 69 | ||||
-rw-r--r-- | src/block/block_mgr.c | 25 | ||||
-rw-r--r-- | src/include/extern.h | 1 |
3 files changed, 67 insertions, 28 deletions
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 1e7f4ff09ae..22ee4376d29 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -362,10 +362,10 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; - bool deleting, locked; + bool deleting, fatal, locked; ci = &block->live; - locked = false; + fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); @@ -391,23 +391,20 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) * This function is the first step, the second step is in the resolve * function. * - * If we're called to checkpoint the same file twice, without the second - * resolution step, it's an error at an upper level and our choices are - * all bad: either leak blocks or risk crashing with our caller not - * having saved the checkpoint information to stable storage. Leaked - * blocks are a safer choice, but that means file verify will fail for - * the rest of "forever", and the chance of us allocating a block and - * then crashing such that it matters is reasonably low: don't leak the - * blocks. + * If we're called to checkpoint the same file twice (without the second + * resolution step), or re-entered for any reason, it's an error in our + * caller, and our choices are all bad: leak blocks or potentially crash + * with our caller not yet having saved previous checkpoint information + * to stable storage. */ - if (block->ckpt_inprogress) { - __wt_errx(session, - "%s: checkpointed without first resolving the previous " - "checkpoint", - block->name); - - WT_RET(__wt_block_checkpoint_resolve(session, block)); - } + __wt_spin_lock(session, &block->live_lock); + if (block->ckpt_inprogress) + ret = __wt_block_panic(session, EINVAL, + "%s: unexpected checkpoint ordering", block->name); + else + block->ckpt_inprogress = true; + __wt_spin_unlock(session, &block->live_lock); + WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints @@ -464,6 +461,15 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) } /* + * Failures are now fatal: we can't currently back out the merge of any + * deleted checkpoint extent lists into the live system's extent lists, + * so continuing after error would leave the live system's extent lists + * corrupted for any subsequent checkpoint (and potentially, should a + * subsequent checkpoint succeed, for recovery). + */ + fatal = true; + + /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint @@ -653,9 +659,11 @@ live_update: "list"); #endif - block->ckpt_inprogress = true; +err: if (ret != 0 && fatal) + ret = __wt_block_panic(session, ret, + "%s: fatal checkpoint failure", block->name); -err: if (locked) + if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ @@ -767,15 +775,16 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) * Resolve the checkpoint after our caller has written the checkpoint * information to stable storage. */ - if (!block->ckpt_inprogress) - WT_RET_MSG(session, WT_ERROR, - "%s: checkpoint resolved, but no checkpoint in progress", - block->name); - block->ckpt_inprogress = false; - __wt_spin_lock(session, &block->live_lock); - ret = __wt_block_extlist_merge( - session, block, &ci->ckpt_avail, &ci->avail); + if (!block->ckpt_inprogress) + WT_ERR(__wt_block_panic(session, WT_ERROR, + "%s: checkpoint resolution with no checkpoint in progress", + block->name)); + + if ((ret = __wt_block_extlist_merge( + session, block, &ci->ckpt_avail, &ci->avail)) != 0) + WT_ERR(__wt_block_panic(session, ret, + "%s: fatal checkpoint failure", block->name)); __wt_spin_unlock(session, &block->live_lock); /* Discard the lists remaining after the checkpoint call. */ @@ -783,6 +792,10 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block) __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); + __wt_spin_lock(session, &block->live_lock); + block->ckpt_inprogress = 0; +err: __wt_spin_unlock(session, &block->live_lock); + return (ret); } diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index eff25f34304..db52ad036e2 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -607,3 +607,28 @@ __wt_block_manager_open(WT_SESSION_IMPL *session, err: WT_TRET(bm->close(bm, session)); return (ret); } + +/* + * __wt_block_panic -- + * Report an error, then panic the handle and the system. + */ +int +__wt_block_panic(WT_SESSION_IMPL *session, int error, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((cold)) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) +{ + va_list ap; + + /* + * Ignore error returns from underlying event handlers, we already have + * an error value to return. + */ + va_start(ap, fmt); + WT_IGNORE_RET(__wt_eventv(session, false, error, NULL, 0, fmt, ap)); + va_end(ap); + + /* Switch the handle into read-only mode. */ + __bm_method_set(S2BT(session)->bm, true); + + return (__wt_panic(session)); +} diff --git a/src/include/extern.h b/src/include/extern.h index e5af4c62763..90c7d884f3c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -44,6 +44,7 @@ extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el); extern int __wt_block_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_regionp, size_t *lengthp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_unmap(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_block_panic(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_manager_drop( WT_SESSION_IMPL *session, const char *filename, bool durable) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on); |