summaryrefslogtreecommitdiff
path: root/src/btree/bt_sync.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree/bt_sync.c')
-rw-r--r--src/btree/bt_sync.c80
1 files changed, 59 insertions, 21 deletions
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 7b583bd9c1e..6d4ad9d0d0f 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -9,6 +9,59 @@
#include "wt_internal.h"
/*
+ * __sync_checkpoint_can_skip --
+ * There are limited conditions under which we can skip writing a dirty
+ * page during checkpoint.
+ */
+static inline bool
+__sync_checkpoint_can_skip(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_PAGE_MODIFY *mod;
+ WT_MULTI *multi;
+ WT_TXN *txn;
+ u_int i;
+
+ mod = page->modify;
+ txn = &session->txn;
+
+ /*
+ * We can skip some dirty pages during a checkpoint. The requirements:
+ *
+ * 1. they must be leaf pages,
+ * 2. there is a snapshot transaction active (which is the case in
+ * ordinary application checkpoints but not all internal cases),
+ * 3. the first dirty update on the page is sufficiently recent the
+ * checkpoint transaction would skip them,
+ * 4. there's already an address for every disk block involved.
+ */
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (false);
+ if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
+ return (false);
+ if (!WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn))
+ return (false);
+
+ /*
+ * The problematic case is when a page was evicted but when there were
+ * unresolved updates and not every block associated with the page has
+ * a disk address. We can't skip such pages because we need a checkpoint
+ * write with valid addresses.
+ *
+ * The page's modification information can change underfoot if the page
+ * is being reconciled, so we'd normally serialize with reconciliation
+ * before reviewing page-modification information. However, checkpoint
+ * is the only valid writer of dirty leaf pages at this point, we skip
+ * the lock.
+ */
+ if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
+ for (multi = mod->mod_multi,
+ i = 0; i < mod->mod_multi_entries; ++multi, ++i)
+ if (multi->addr.addr == NULL)
+ return (false);
+ return (true);
+}
+
+/*
* __sync_file --
* Flush pages for a specific file.
*/
@@ -20,7 +73,6 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
WT_REF *walk;
WT_TXN *txn;
uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
@@ -161,29 +213,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* reference and checking modified.
*/
page = walk->page;
- mod = page->modify;
/*
- * Write dirty pages, unless we can be sure they only
- * became dirty after the checkpoint started.
- *
- * We can skip dirty pages if:
- * (1) they are leaf pages;
- * (2) there is a snapshot transaction active (which
- * is the case in ordinary application checkpoints
- * but not all internal cases); and
- * (3) the first dirty update on the page is
- * sufficiently recent that the checkpoint
- * transaction would skip them.
- *
- * Mark the tree dirty: the checkpoint marked it clean
- * and we can't skip future checkpoints until this page
- * is written.
+ * Write dirty pages, if we can't skip them. If we skip
+ * a page, mark the tree dirty. The checkpoint marked it
+ * clean and we can't skip future checkpoints until this
+ * page is written.
*/
- if (!WT_PAGE_IS_INTERNAL(page) &&
- F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
- WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
- __wt_page_modify_set(session, page);
+ if (__sync_checkpoint_can_skip(session, page)) {
+ __wt_tree_modify_set(session);
continue;
}