diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2016-10-20 17:13:15 -0400 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-10-20 17:13:15 -0400 |
commit | ceeb57b565fca6ade4bb02d8cf62095374743bd1 (patch) | |
tree | 70e7f7ef9f3333b3a66b094f936f96c8db1c62aa | |
parent | 940a7aba6a2c37f878114e8280eabf161a212deb (diff) | |
download | mongo-ceeb57b565fca6ade4bb02d8cf62095374743bd1.tar.gz |
WT-2985 checkpoint core dump (#3100)
* Checkpoint must not skip writing a leaf page that's never been written before.
* Split out the code that marks a tree dirty into its own function: checkpoint needs to mark the tree dirty when it skips a page, and dirtying the page itself is relatively expensive.
-rw-r--r-- | src/btree/bt_sync.c | 80 | ||||
-rw-r--r-- | src/include/btree.i | 57 |
2 files changed, 93 insertions, 44 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7b583bd9c1e..6d4ad9d0d0f 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -9,6 +9,59 @@ #include "wt_internal.h" /* + * __sync_checkpoint_can_skip -- + * There are limited conditions under which we can skip writing a dirty + * page during checkpoint. + */ +static inline bool +__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + WT_TXN *txn; + u_int i; + + mod = page->modify; + txn = &session->txn; + + /* + * We can skip some dirty pages during a checkpoint. The requirements: + * + * 1. they must be leaf pages, + * 2. there is a snapshot transaction active (which is the case in + * ordinary application checkpoints but not all internal cases), + * 3. the first dirty update on the page is sufficiently recent the + * checkpoint transaction would skip them, + * 4. there's already an address for every disk block involved. + */ + if (WT_PAGE_IS_INTERNAL(page)) + return (false); + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) + return (false); + + /* + * The problematic case is when a page was evicted but when there were + * unresolved updates and not every block associated with the page has + * a disk address. We can't skip such pages because we need a checkpoint + * write with valid addresses. + * + * The page's modification information can change underfoot if the page + * is being reconciled, so we'd normally serialize with reconciliation + * before reviewing page-modification information. However, checkpoint + * is the only valid writer of dirty leaf pages at this point, we skip + * the lock. + */ + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) + if (multi->addr.addr == NULL) + return (false); + return (true); +} + +/* * __sync_file -- * Flush pages for a specific file. 
*/ @@ -20,7 +73,6 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; @@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * reference and checking modified. */ page = walk->page; - mod = page->modify; /* - * Write dirty pages, unless we can be sure they only - * became dirty after the checkpoint started. - * - * We can skip dirty pages if: - * (1) they are leaf pages; - * (2) there is a snapshot transaction active (which - * is the case in ordinary application checkpoints - * but not all internal cases); and - * (3) the first dirty update on the page is - * sufficiently recent that the checkpoint - * transaction would skip them. - * - * Mark the tree dirty: the checkpoint marked it clean - * and we can't skip future checkpoints until this page - * is written. + * Write dirty pages, if we can't skip them. If we skip + * a page, mark the tree dirty. The checkpoint marked it + * clean and we can't skip future checkpoints until this + * page is written. */ - if (!WT_PAGE_IS_INTERNAL(page) && - F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { - __wt_page_modify_set(session, page); + if (__sync_checkpoint_can_skip(session, page)) { + __wt_tree_modify_set(session); continue; } diff --git a/src/include/btree.i b/src/include/btree.i index 79367da9cc9..daf2eb158c1 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -485,6 +485,38 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_tree_modify_set -- + * Mark the tree dirty. + */ +static inline void +__wt_tree_modify_set(WT_SESSION_IMPL *session) +{ + /* + * Test before setting the dirty flag, it's a hot cache line. + * + * The tree's modified flag is cleared by the checkpoint thread: set it + * and insert a barrier before dirtying the page. 
(I don't think it's + * a problem if the tree is marked dirty with all the pages clean, it + * might result in an extra checkpoint that doesn't do any work but it + * shouldn't cause problems; regardless, let's play it safe.) + */ + if (!S2BT(session)->modified) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + + S2BT(session)->modified = true; + WT_FULL_BARRIER(); + } + + /* + * The btree may already be marked dirty while the connection is still + * clean; mark the connection dirty outside the test of the btree state. + */ + if (!S2C(session)->modified) + S2C(session)->modified = true; +} + +/* * __wt_page_modify_clear -- * Clean a modified page. */ @@ -513,30 +545,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) /* * Mark the tree dirty (even if the page is already marked dirty), newly * created pages to support "empty" files are dirty, but the file isn't - * marked dirty until there's a real change needing to be written. Test - * before setting the dirty flag, it's a hot cache line. - * - * The tree's modified flag is cleared by the checkpoint thread: set it - * and insert a barrier before dirtying the page. (I don't think it's - * a problem if the tree is marked dirty with all the pages clean, it - * might result in an extra checkpoint that doesn't do any work but it - * shouldn't cause problems; regardless, let's play it safe.) - */ - if (!S2BT(session)->modified) { - /* Assert we never dirty a checkpoint handle. */ - WT_ASSERT(session, session->dhandle->checkpoint == NULL); - - S2BT(session)->modified = true; - WT_FULL_BARRIER(); - } - - /* - * There is a possibility of btree being dirty whereas connection being - * clean when entering this function. So make sure to update connection - * to dirty outside a condition on btree modified flag. + * marked dirty until there's a real change needing to be written. 
*/ - if (!S2C(session)->modified) - S2C(session)->modified = true; + __wt_tree_modify_set(session); __wt_page_only_modify_set(session, page); } |