summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorKeith Bostic <keith.bostic@mongodb.com>2016-09-08 02:27:14 -0400
committerMichael Cahill <michael.cahill@mongodb.com>2016-09-08 16:27:14 +1000
commit5f07ab685f2b1e978cfda354dd52a5fc52f8f8ab (patch)
treea3f211c6b2e6cf48637cf04b85958ba72efc409a /src
parentac66d28f02f6ede3ed5f3d0eaf273a47ee4c2147 (diff)
downloadmongo-5f07ab685f2b1e978cfda354dd52a5fc52f8f8ab.tar.gz
WT-2897 Checkpoints can become corrupted on failure (#3027)
During a checkpoint, the extent lists of previous checkpoints that are being discarded are merged into the live system's extent lists. If there's an error (most likely disk-full causing a write to fail), the live system's allocation list can be left corrupted. Future checkpoints are not locked out by this failure, which means a subsequent checkpoint (for example, the sweep server closing the file) could potentially create a checkpoint based on corrupted extent lists. After the live system's extent lists have been modified during a checkpoint, change errors to switch the handle into read-only mode and panic the system. Second, the checks for re-entering the checkpoint code don't prevent bugs in the upper-level code from affecting checkpoints, and I'm suspicious that is happening. Adjust the in-checkpoint tests and use the live system's lock to ensure the checkpoint code is never re-entered.
Diffstat (limited to 'src')
-rw-r--r--src/block/block_ckpt.c69
-rw-r--r--src/block/block_mgr.c25
-rw-r--r--src/include/extern.h1
3 files changed, 67 insertions, 28 deletions
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index 1e7f4ff09ae..22ee4376d29 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -362,10 +362,10 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
WT_DECL_ITEM(tmp);
WT_DECL_RET;
uint64_t ckpt_size;
- bool deleting, locked;
+ bool deleting, fatal, locked;
ci = &block->live;
- locked = false;
+ fatal = locked = false;
#ifdef HAVE_DIAGNOSTIC
WT_RET(__ckpt_verify(session, ckptbase));
@@ -391,23 +391,20 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
* This function is the first step, the second step is in the resolve
* function.
*
- * If we're called to checkpoint the same file twice, without the second
- * resolution step, it's an error at an upper level and our choices are
- * all bad: either leak blocks or risk crashing with our caller not
- * having saved the checkpoint information to stable storage. Leaked
- * blocks are a safer choice, but that means file verify will fail for
- * the rest of "forever", and the chance of us allocating a block and
- * then crashing such that it matters is reasonably low: don't leak the
- * blocks.
+ * If we're called to checkpoint the same file twice (without the second
+ * resolution step), or re-entered for any reason, it's an error in our
+ * caller, and our choices are all bad: leak blocks or potentially crash
+ * with our caller not yet having saved previous checkpoint information
+ * to stable storage.
*/
- if (block->ckpt_inprogress) {
- __wt_errx(session,
- "%s: checkpointed without first resolving the previous "
- "checkpoint",
- block->name);
-
- WT_RET(__wt_block_checkpoint_resolve(session, block));
- }
+ __wt_spin_lock(session, &block->live_lock);
+ if (block->ckpt_inprogress)
+ ret = __wt_block_panic(session, EINVAL,
+ "%s: unexpected checkpoint ordering", block->name);
+ else
+ block->ckpt_inprogress = true;
+ __wt_spin_unlock(session, &block->live_lock);
+ WT_RET(ret);
/*
* Extents newly available as a result of deleting previous checkpoints
@@ -464,6 +461,15 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
}
/*
+ * Failures are now fatal: we can't currently back out the merge of any
+ * deleted checkpoint extent lists into the live system's extent lists,
+ * so continuing after error would leave the live system's extent lists
+ * corrupted for any subsequent checkpoint (and potentially, should a
+ * subsequent checkpoint succeed, for recovery).
+ */
+ fatal = true;
+
+ /*
* Hold a lock so the live extent lists and the file size can't change
* underneath us. I suspect we'll tighten this if checkpoints take too
* much time away from real work: we read the historic checkpoint
@@ -653,9 +659,11 @@ live_update:
"list");
#endif
- block->ckpt_inprogress = true;
+err: if (ret != 0 && fatal)
+ ret = __wt_block_panic(session, ret,
+ "%s: fatal checkpoint failure", block->name);
-err: if (locked)
+ if (locked)
__wt_spin_unlock(session, &block->live_lock);
/* Discard any checkpoint information we loaded. */
@@ -767,15 +775,16 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
* Resolve the checkpoint after our caller has written the checkpoint
* information to stable storage.
*/
- if (!block->ckpt_inprogress)
- WT_RET_MSG(session, WT_ERROR,
- "%s: checkpoint resolved, but no checkpoint in progress",
- block->name);
- block->ckpt_inprogress = false;
-
__wt_spin_lock(session, &block->live_lock);
- ret = __wt_block_extlist_merge(
- session, block, &ci->ckpt_avail, &ci->avail);
+ if (!block->ckpt_inprogress)
+ WT_ERR(__wt_block_panic(session, WT_ERROR,
+ "%s: checkpoint resolution with no checkpoint in progress",
+ block->name));
+
+ if ((ret = __wt_block_extlist_merge(
+ session, block, &ci->ckpt_avail, &ci->avail)) != 0)
+ WT_ERR(__wt_block_panic(session, ret,
+ "%s: fatal checkpoint failure", block->name));
__wt_spin_unlock(session, &block->live_lock);
/* Discard the lists remaining after the checkpoint call. */
@@ -783,6 +792,10 @@ __wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_block_extlist_free(session, &ci->ckpt_alloc);
__wt_block_extlist_free(session, &ci->ckpt_discard);
+ __wt_spin_lock(session, &block->live_lock);
+ block->ckpt_inprogress = 0;
+err: __wt_spin_unlock(session, &block->live_lock);
+
return (ret);
}
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index eff25f34304..db52ad036e2 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -607,3 +607,28 @@ __wt_block_manager_open(WT_SESSION_IMPL *session,
err: WT_TRET(bm->close(bm, session));
return (ret);
}
+
+/*
+ * __wt_block_panic --
+ * Report an error, then panic the handle and the system.
+ */
+int
+__wt_block_panic(WT_SESSION_IMPL *session, int error, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((cold))
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ va_list ap;
+
+ /*
+ * Ignore error returns from underlying event handlers, we already have
+ * an error value to return.
+ */
+ va_start(ap, fmt);
+ WT_IGNORE_RET(__wt_eventv(session, false, error, NULL, 0, fmt, ap));
+ va_end(ap);
+
+ /* Switch the handle into read-only mode. */
+ __bm_method_set(S2BT(session)->bm, true);
+
+ return (__wt_panic(session));
+}
diff --git a/src/include/extern.h b/src/include/extern.h
index e5af4c62763..90c7d884f3c 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -44,6 +44,7 @@ extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
extern int __wt_block_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_regionp, size_t *lengthp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_unmap(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_block_panic(WT_SESSION_IMPL *session, int error, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_manager_drop( WT_SESSION_IMPL *session, const char *filename, bool durable) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern void __wt_block_configure_first_fit(WT_BLOCK *block, bool on);