summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Cahill <michael.cahill@wiredtiger.com> 2014-05-09 14:28:34 +1000
committer Michael Cahill <michael.cahill@wiredtiger.com> 2014-05-09 14:28:34 +1000
commit cd98e8877b6e9b3b1e5d6474b45ef4bbc1a6d24a (patch)
tree 7213f155225b6af03eae27d14d4c53b98b1a3faa
parent 5407d69f616c6ff5227e608cd54c4f3b85555ae2 (diff)
download mongo-cd98e8877b6e9b3b1e5d6474b45ef4bbc1a6d24a.tar.gz
Speed up checkpoints by doing a better job of skipping pages that can't contain changes that need to be included.
Before the checkpoint pass, we have written every dirty leaf page at read-committed isolation. So the only pages that checkpoint has to write are internal pages, or leaf pages that were dirtied after the "write leaves" phase but before the checkpoint transaction started. Now that we separate out the allocation of a transaction ID from setting up a snapshot, we can distinguish between update transactions and the transaction we're using for the checkpoint. refs #954, #963, #1001
-rw-r--r-- src/btree/bt_evict.c 17
-rw-r--r-- src/btree/bt_sync.c 30
-rw-r--r-- src/btree/rec_evict.c 35
-rw-r--r-- src/btree/rec_split.c 10
-rw-r--r-- src/btree/rec_write.c 8
-rw-r--r-- src/include/btmem.h 11
-rw-r--r-- src/include/btree.h 6
-rw-r--r-- src/include/btree.i 22
8 files changed, 47 insertions, 92 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 34a71e8bad2..dcdce17ef58 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -815,6 +815,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
uint64_t pages_walked;
uint32_t walk_flags;
int internal_pages, modified, restarts;
@@ -883,12 +884,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
* with the checkpointing thread.
*/
modified = __wt_page_is_modified(page);
-#ifdef EVICTION_DURING_CHECKPOINT
- if (modified && btree->checkpointing &&
- page->modify->checkpoint_gen >= btree->checkpoint_gen)
-#else
if (modified && btree->checkpointing)
-#endif
continue;
/* Optionally ignore clean pages. */
@@ -899,9 +895,10 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
* If the page is clean but has modifications that appear too
* new to evict, skip it.
*/
- if (!modified && page->modify != NULL &&
+ mod = page->modify;
+ if (!modified && mod != NULL &&
!LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
- !__wt_txn_visible_all(session, page->modify->rec_max_txn))
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
continue;
/*
@@ -920,10 +917,8 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
*/
if (modified && !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE) &&
!btree->checkpointing &&
- (page->modify->disk_snap_min ==
- S2C(session)->txn_global.oldest_id ||
- !__wt_txn_visible_all(session,
- page->modify->update_txn)))
+ (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
continue;
WT_ASSERT(session, evict->ref == NULL);
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 84e5261310e..d8114a6019a 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -23,7 +23,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
WT_TXN *txn;
uint64_t internal_bytes, leaf_bytes;
uint64_t internal_pages, leaf_pages;
- uint32_t checkpoint_gen, flags;
+ uint32_t flags;
btree = S2BT(session);
walk = NULL;
@@ -70,7 +70,6 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
* eviction to complete.
*/
btree->checkpointing = 1;
- checkpoint_gen = ++btree->checkpoint_gen;
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
WT_ERR(__wt_evict_file_exclusive_on(session));
@@ -88,19 +87,12 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
* Write dirty pages, unless we can be sure they only
* became dirty after the checkpoint started.
*
- * We can skip pages if:
+ * We can skip dirty pages if:
* (1) they are leaf pages;
- * (2) the global checkpoint generation has been
- * incremented (otherwise we skip writing the
- * metadata when first creating tables);
- * (3) the page's checkpoint generation is equal to
- * the current checkpoint generation, so it has
- * already been written since this checkpoint
- * started; and
- * (4) there is a snapshot transaction active (which
+ * (2) there is a snapshot transaction active (which
* is the case in ordinary application checkpoints
* but not all internal cases); and
- * (5) any updates skipped by reconciliation were
+ * (3) the first dirty update on the page is
* sufficiently recent that the checkpoint
* transaction would skip them.
*/
@@ -108,10 +100,8 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
mod = page->modify;
if (__wt_page_is_modified(page) &&
(WT_PAGE_IS_INTERNAL(page) ||
- checkpoint_gen == 0 ||
- mod->checkpoint_gen < checkpoint_gen ||
!F_ISSET(txn, TXN_HAS_SNAPSHOT) ||
- TXNID_LE(mod->rec_skipped_txn, txn->snap_max))) {
+ TXNID_LE(mod->first_dirty_txn, txn->snap_max))) {
if (WT_PAGE_IS_INTERNAL(page)) {
internal_bytes +=
page->memory_footprint;
@@ -122,17 +112,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
}
WT_ERR(__wt_rec_write(session, walk, NULL, 0));
}
-
- /*
- * Set the checkpoint generation, even if we didn't
- * write the page. If it becomes dirty and is selected
- * for eviction, it can't be written until this
- * checkpoint completes.
- */
- if (page->modify != NULL)
- page->modify->checkpoint_gen = checkpoint_gen;
}
- WT_ASSERT(session, checkpoint_gen == btree->checkpoint_gen);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index e373a2905eb..bce7b4c6c64 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -241,7 +241,6 @@ __rec_review(
WT_PAGE_MODIFY *mod;
WT_REF *child;
uint32_t flags;
- int behind_checkpoint;
btree = S2BT(session);
page = ref->page;
@@ -293,46 +292,34 @@ __rec_review(
} WT_INTL_FOREACH_END;
/*
- * If the file is being checkpointed, we can't evict dirty pages already
- * visited during the checkpoint: if we write a page and free the
- * previous version of the page, that previous version might be
- * referenced by an internal page already been written in the
- * checkpoint, leaving the checkpoint inconsistent.
- * Don't rely on new updates being skipped by the transaction used
+ * If the file is being checkpointed, we can't evict dirty pages:
+ * if we write a page and free the previous version of the page, that
+ * previous version might be referenced by an internal page already
+ * written in the checkpoint, leaving the checkpoint inconsistent.
+ *
+ * Don't rely on new updates being skipped by the transaction used
* for transaction reads: (1) there are paths that dirty pages for
* artificial reasons; (2) internal pages aren't transactional; and
* (3) if an update was skipped during the checkpoint (leaving the page
* dirty), then rolled back, we could still successfully overwrite a
* page and corrupt the checkpoint.
- * Further, we can't race with the checkpoint's reconciliation of
+ *
+ * Further, we can't race with the checkpoint's reconciliation of
* an internal page as we evict a clean child from the page's subtree.
* This works in the usual way: eviction locks the page and then checks
* for existing hazard pointers, the checkpoint thread reconciling an
* internal page acquires hazard pointers on child pages it reads, and
* is blocked by the exclusive lock.
*/
- mod = page->modify;
-#ifdef EVICTION_DURING_CHECKPOINT
- behind_checkpoint = btree->checkpointing && (mod != NULL) &&
- mod->checkpoint_gen >= btree->checkpoint_gen;
-#else
- behind_checkpoint = btree->checkpointing && (mod != NULL);
-#endif
-
- if (behind_checkpoint && __wt_page_is_modified(page)) {
+ if ((mod = page->modify) != NULL && btree->checkpointing &&
+ (__wt_page_is_modified(page) ||
+ F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
return (EBUSY);
}
/*
- * If we behind a checkpoint, we can't merge multiblock pages into
- * their parent.
- */
- if (behind_checkpoint && F_ISSET(mod, WT_PM_REC_MULTIBLOCK))
- return (EBUSY);
-
- /*
* Fail if any page in the top-level page's subtree won't be merged into
* its parent, the page that cannot be merged must be evicted first.
* The test is necessary but should not fire much: the eviction code is
diff --git a/src/btree/rec_split.c b/src/btree/rec_split.c
index b45624c3cb0..c9e61582c14 100644
--- a/src/btree/rec_split.c
+++ b/src/btree/rec_split.c
@@ -549,11 +549,13 @@ __split_inmem_build(
}
/*
- * We modified the page above, which will have copied the current
- * checkpoint generation. If there is a checkpoint in progress, it
- * must write this page, so reset the checkpoint generation to zero.
+ * We modified the page above, which will have set the first dirty
+ * transaction to the last transaction currently running. However, the
+ * updates we installed may be older than that. Take the oldest active
+ * transaction ID to make sure these updates are not skipped by a
+ * checkpoint.
*/
- page->modify->checkpoint_gen = 0;
+ page->modify->first_dirty_txn = S2C(session)->txn_global.oldest_id;
err: __wt_scr_free(&key);
/* Free any resources that may have been cached in the cursor. */
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 8d562c6df69..9a605c86914 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -4655,7 +4655,7 @@ err: __wt_scr_free(&tkey);
WT_PANIC_RETX(session,
"reconciliation illegally skipped an update");
- mod->rec_skipped_txn = r->skipped_txn;
+ mod->first_dirty_txn = r->skipped_txn;
btree->modified = 1;
WT_FULL_BARRIER();
@@ -4676,12 +4676,6 @@ err: __wt_scr_free(&tkey);
__wt_cache_dirty_decr(session, page);
}
- /*
- * Set the checkpoint generation, used to determine whether we can skip
- * writing this page again.
- */
- mod->checkpoint_gen = btree->checkpoint_gen;
-
return (0);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 629607a5071..66031ed24d8 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -181,8 +181,8 @@ struct __wt_page_modify {
/* The largest transaction ID seen on the page by reconciliation. */
uint64_t rec_max_txn;
- /* The smallest transaction ID skipped by reconciliation. */
- uint64_t rec_skipped_txn;
+ /* The first unwritten transaction ID (approximate). */
+ uint64_t first_dirty_txn;
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
@@ -328,13 +328,6 @@ struct __wt_page_modify {
*/
uint32_t write_gen;
- /*
- * The checkpoint generation is the most recent checkpoint to have
- * visited a page. When a checkpoint is in progress, dirty pages that
- * have not yet been visited can be evicted by application threads.
- */
- uint32_t checkpoint_gen;
-
#define WT_PAGE_LOCK(s, p) \
__wt_spin_lock((s), &S2C(s)->page_lock[(p)->modify->page_lock])
#define WT_PAGE_UNLOCK(s, p) \
diff --git a/src/include/btree.h b/src/include/btree.h
index 6f20b2c2784..fece786fced 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -117,12 +117,6 @@ struct __wt_btree {
u_int evict_walk_skips; /* Number of walks skipped */
volatile uint32_t evict_busy; /* Count of threads in eviction */
- /*
- * The current checkpoint generation. Use a 32-bit count for now: if
- * we can do 4 billion checkpoints without a restart, we'll be in good
- * shape.
- */
- uint32_t checkpoint_gen;
int checkpointing; /* Checkpoint in progress */
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
diff --git a/src/include/btree.i b/src/include/btree.i
index e433a558d9c..43418d65b96 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -323,6 +323,13 @@ static inline void
__wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_TXN_GLOBAL *txn_global;
+ uint64_t last_running;
+
+ txn_global = &S2C(session)->txn_global;
+
+ last_running = 0;
+ if (page->modify->write_gen == 0)
+ last_running = txn_global->last_running;
/*
* We depend on atomic-add being a write barrier, that is, a barrier to
@@ -343,14 +350,17 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
if (F_ISSET(&session->txn, TXN_HAS_SNAPSHOT))
page->modify->disk_snap_min = session->txn.snap_min;
- txn_global = &S2C(session)->txn_global;
- page->modify->rec_skipped_txn = txn_global->last_running;
-
/*
- * Set the checkpoint generation: if a checkpoint is already
- * running, these changes cannot be included, by definition.
+ * We won the race to dirty the page, but another thread could
+ * have committed in the meantime, and the last_running field
+ * been updated past it. That is all very unlikely, but not
+ * impossible, so we take care to read the global state before
+ * the atomic increment. If we raced with reconciliation, just
+ * leave the previous value here: at worst, we will write a
+ * page in a checkpoint when not absolutely necessary.
*/
- page->modify->checkpoint_gen = S2BT(session)->checkpoint_gen;
+ if (last_running != 0)
+ page->modify->first_dirty_txn = last_running;
}
/* Check if this is the largest transaction ID to update the page. */