summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Keith Bostic <keith.bostic@mongodb.com> 2016-10-20 17:13:15 -0400
committer: Michael Cahill <michael.cahill@mongodb.com> 2016-10-20 17:13:15 -0400
commit: ceeb57b565fca6ade4bb02d8cf62095374743bd1 (patch)
tree: 70e7f7ef9f3333b3a66b094f936f96c8db1c62aa
parent: 940a7aba6a2c37f878114e8280eabf161a212deb (diff)
download: mongo-ceeb57b565fca6ade4bb02d8cf62095374743bd1.tar.gz
WT-2985 checkpoint core dump (#3100)
* Checkpoint must not skip writing a leaf page that's never been written before.
* Split out the code to make a tree dirty: checkpoint needs it, and it's relatively expensive to dirty a page.
-rw-r--r-- src/btree/bt_sync.c | 80
-rw-r--r-- src/include/btree.i | 57
2 files changed, 93 insertions, 44 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 7b583bd9c1e..6d4ad9d0d0f 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -9,6 +9,59 @@
#include "wt_internal.h"
/*
+ * __sync_checkpoint_can_skip --
+ * Return true if checkpoint may skip writing this dirty page, false if
+ * the page must be written.
+ */
+static inline bool
+__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_MODIFY *mod;
+ WT_MULTI *multi;
+ WT_TXN *txn;
+ u_int i;
+
+ mod = page->modify;
+ txn = &session->txn;
+
+ /*
+ * A dirty page can be skipped only if all of the following hold:
+ *
+ * 1. it is a leaf page,
+ * 2. there is a snapshot transaction active (which is the case in
+ * ordinary application checkpoints but not all internal cases),
+ * 3. the first dirty update on the page is recent enough that the
+ * checkpoint transaction would skip it (the WT_TXNID_LT test below),
+ * 4. there's already an address for every disk block involved.
+ */
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (false);
+ if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ return (false);
+ if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn))
+ return (false);
+
+ /*
+ * The problematic case is a page that was evicted while it still had
+ * unresolved updates, so not every block associated with the page has
+ * a disk address. We can't skip such pages because we need a checkpoint
+ * write with valid addresses.
+ *
+ * The page's modification information can change underfoot if the page
+ * is being reconciled, so we'd normally serialize with reconciliation
+ * before reviewing page-modification information. However, checkpoint
+ * is the only valid writer of dirty leaf pages at this point, so we
+ * skip the lock.
+ */
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i)
+ if (multi->addr.addr == NULL)
+ return (false);
+ return (true);
+}
+
+/*
* __sync_file --
* Flush pages for a specific file.
*/
@@ -20,7 +73,6 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
WT_REF *walk;
WT_TXN *txn;
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
@@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* reference and checking modified.
*/
page = walk->page;
- mod = page->modify;
/*
- * Write dirty pages, unless we can be sure they only
- * became dirty after the checkpoint started.
- *
- * We can skip dirty pages if:
- * (1) they are leaf pages;
- * (2) there is a snapshot transaction active (which
- * is the case in ordinary application checkpoints
- * but not all internal cases); and
- * (3) the first dirty update on the page is
- * sufficiently recent that the checkpoint
- * transaction would skip them.
- *
- * Mark the tree dirty: the checkpoint marked it clean
- * and we can't skip future checkpoints until this page
- * is written.
+ * Write dirty pages, if we can't skip them. If we skip
+ * a page, mark the tree dirty. The checkpoint marked it
+ * clean and we can't skip future checkpoints until this
+ * page is written.
*/
- if (!WT_PAGE_IS_INTERNAL(page) &&
- F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
- WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
- __wt_page_modify_set(session, page);
+ if (__sync_checkpoint_can_skip(session, page)) {
+ __wt_tree_modify_set(session);
continue;
}
diff --git a/src/include/btree.i b/src/include/btree.i
index 79367da9cc9..daf2eb158c1 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -485,6 +485,38 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __wt_tree_modify_set --
+ * Mark the tree (and its connection) dirty.
+ */
+static inline void
+__wt_tree_modify_set(WT_SESSION_IMPL *session)
+{
+ /*
+ * Test before setting the dirty flag, it's a hot cache line.
+ *
+ * The tree's modified flag is cleared by the checkpoint thread: set it
+ * and insert a barrier before the caller dirties any page. (I don't
+ * think it's a problem if the tree is marked dirty with all the pages
+ * clean, it might result in an extra checkpoint that doesn't do any
+ * work but it shouldn't cause problems; regardless, play it safe.)
+ */
+ if (!S2BT(session)->modified) {
+ /* Assert we never dirty a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
+ S2BT(session)->modified = true;
+ WT_FULL_BARRIER();
+ }
+
+ /*
+ * The btree may already be marked dirty while the connection is still
+ * clean; mark the connection dirty outside the test of the btree state.
+ */
+ if (!S2C(session)->modified)
+ S2C(session)->modified = true;
+}
+
+/*
* __wt_page_modify_clear --
* Clean a modified page.
*/
@@ -513,30 +545,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* Mark the tree dirty (even if the page is already marked dirty), newly
* created pages to support "empty" files are dirty, but the file isn't
- * marked dirty until there's a real change needing to be written. Test
- * before setting the dirty flag, it's a hot cache line.
- *
- * The tree's modified flag is cleared by the checkpoint thread: set it
- * and insert a barrier before dirtying the page. (I don't think it's
- * a problem if the tree is marked dirty with all the pages clean, it
- * might result in an extra checkpoint that doesn't do any work but it
- * shouldn't cause problems; regardless, let's play it safe.)
- */
- if (!S2BT(session)->modified) {
- /* Assert we never dirty a checkpoint handle. */
- WT_ASSERT(session, session->dhandle->checkpoint == NULL);
-
- S2BT(session)->modified = true;
- WT_FULL_BARRIER();
- }
-
- /*
- * There is a possibility of btree being dirty whereas connection being
- * clean when entering this function. So make sure to update connection
- * to dirty outside a condition on btree modified flag.
+ * marked dirty until there's a real change needing to be written.
*/
- if (!S2C(session)->modified)
- S2C(session)->modified = true;
+ __wt_tree_modify_set(session);
__wt_page_only_modify_set(session, page);
}