diff options
Diffstat (limited to 'src/btree/bt_sync.c')
-rw-r--r-- | src/btree/bt_sync.c | 80 |
1 files changed, 59 insertions, 21 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7b583bd9c1e..6d4ad9d0d0f 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -9,6 +9,59 @@ #include "wt_internal.h" /* + * __sync_checkpoint_can_skip -- + * There are limited conditions under which we can skip writing a dirty + * page during checkpoint. + */ +static inline bool +__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_PAGE_MODIFY *mod; + WT_MULTI *multi; + WT_TXN *txn; + u_int i; + + mod = page->modify; + txn = &session->txn; + + /* + * We can skip some dirty pages during a checkpoint. The requirements: + * + * 1. they must be leaf pages, + * 2. there is a snapshot transaction active (which is the case in + * ordinary application checkpoints but not all internal cases), + * 3. the first dirty update on the page is sufficiently recent the + * checkpoint transaction would skip them, + * 4. there's already an address for every disk block involved. + */ + if (WT_PAGE_IS_INTERNAL(page)) + return (false); + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (false); + if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) + return (false); + + /* + * The problematic case is when a page was evicted but when there were + * unresolved updates and not every block associated with the page has + * a disk address. We can't skip such pages because we need a checkpoint + * write with valid addresses. + * + * The page's modification information can change underfoot if the page + * is being reconciled, so we'd normally serialize with reconciliation + * before reviewing page-modification information. However, checkpoint + * is the only valid writer of dirty leaf pages at this point, we skip + * the lock. + */ + if (mod->rec_result == WT_PM_REC_MULTIBLOCK) + for (multi = mod->mod_multi, + i = 0; i < mod->mod_multi_entries; ++multi, ++i) + if (multi->addr.addr == NULL) + return (false); + return (true); +} + +/* * __sync_file -- * Flush pages for a specific file. */ @@ -20,7 +73,6 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; @@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * reference and checking modified. */ page = walk->page; - mod = page->modify; /* - * Write dirty pages, unless we can be sure they only - * became dirty after the checkpoint started. - * - * We can skip dirty pages if: - * (1) they are leaf pages; - * (2) there is a snapshot transaction active (which - * is the case in ordinary application checkpoints - * but not all internal cases); and - * (3) the first dirty update on the page is - * sufficiently recent that the checkpoint - * transaction would skip them. - * - * Mark the tree dirty: the checkpoint marked it clean - * and we can't skip future checkpoints until this page - * is written. + * Write dirty pages, if we can't skip them. If we skip + * a page, mark the tree dirty. The checkpoint marked it + * clean and we can't skip future checkpoints until this + * page is written. */ - if (!WT_PAGE_IS_INTERNAL(page) && - F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { - __wt_page_modify_set(session, page); + if (__sync_checkpoint_can_skip(session, page)) { + __wt_tree_modify_set(session); continue; } |