summary refs log tree commit diff
diff options
context:
space:
mode:
author Keith Bostic <keith.bostic@mongodb.com> 2016-12-06 21:52:34 -0500
committer Alex Gorrod <alexander.gorrod@mongodb.com> 2016-12-07 13:52:34 +1100
commit 1adda6a0a51e51ffa5a3fa9cf86f6f765585a5f4 (patch)
tree 489278ba31b4a033111e33a0337fd9a545ebdabf
parent c68e35c4c8aca43ebfebaf1ed8a68bef71ccb4cc (diff)
download mongo-1adda6a0a51e51ffa5a3fa9cf86f6f765585a5f4.tar.gz
WT-2960 Reduce likelihood of using the lookaside file, especially when inserting multi-megabyte values (#3171)
Don't configure the lookaside table as soon as eviction is stuck. Instead, only configure the lookaside table if update/restore reconciliation fails and there's reason to believe the lookaside table will be effective, based on the updates that were skipped during the update/restore reconciliation. Previously, the evaluation of whether an update/restore reconciliation would make progress was a check for a single block rewrite with no updates to restore, or at least one update chain without a skipped entry. Check more deeply: if there's any block without updates to be restored, or at least 10% of the update chains didn't have skipped entries, assume we're making progress.
-rw-r--r-- src/btree/bt_slvg.c 4
-rw-r--r-- src/btree/bt_sync.c 6
-rw-r--r-- src/evict/evict_file.c 3
-rw-r--r-- src/evict/evict_page.c 68
-rw-r--r-- src/include/extern.h 2
-rw-r--r-- src/os_posix/os_fs.c 3
-rw-r--r-- src/reconcile/rec_write.c 50
7 files changed, 85 insertions, 51 deletions
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index f269c2d7f43..fde4d4fb9de 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1300,7 +1300,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->pg_var_d = save_col_var;
@@ -2011,7 +2011,7 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL));
/* Reset the page. */
page->pg_row_entries += skip_stop;
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 129d7fec05f..7bf15baa67f 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -136,8 +136,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
__wt_txn_get_snapshot(session);
leaf_bytes += page->memory_footprint;
++leaf_pages;
- WT_ERR(__wt_reconcile(
- session, walk, NULL, WT_CHECKPOINTING));
+ WT_ERR(__wt_reconcile(session,
+ walk, NULL, WT_CHECKPOINTING, NULL));
}
}
break;
@@ -233,7 +233,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
++leaf_pages;
}
WT_ERR(__wt_reconcile(
- session, walk, NULL, WT_CHECKPOINTING));
+ session, walk, NULL, WT_CHECKPOINTING, NULL));
}
break;
case WT_SYNC_CLOSE:
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index b0cd50cc655..17b038fb003 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -57,7 +57,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* error, retrying later.
*/
if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
- WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));
+ WT_ERR(__wt_reconcile(
+ session, ref, NULL, WT_EVICTING, NULL));
/*
* We can't evict the page just returned to us (it marks our
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 3d1557e027e..893133432bb 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -396,7 +396,7 @@ __evict_review(
WT_DECL_RET;
WT_PAGE *page;
uint32_t flags;
- bool modified;
+ bool lookaside_retry, modified;
flags = WT_EVICTING;
*flagsp = flags;
@@ -495,27 +495,29 @@ __evict_review(
* If we have an exclusive lock (we're discarding the tree), assert
* there are no updates we cannot read.
*
- * Don't set any other flags for internal pages: they don't have update
- * lists to be saved and restored, nor can we re-create them in memory.
+ * Don't set any other flags for internal pages: there are no update
+ * lists to be saved and restored, changes can't be written into the
+ * lookaside table, nor can we re-create internal pages in memory.
*
* For leaf pages:
*
- * If an in-memory configuration or the page is being forcibly evicted,
- * set the update-restore flag, so reconciliation will write blocks it
+ * In-memory pages are a known configuration.
+ *
+ * Set the update/restore flag, so reconciliation will write blocks it
* can write and create a list of skipped updates for blocks it cannot
- * write, along with disk images. This is how eviction of active, huge
+ * write, along with disk images. This is how eviction of active, huge
* pages works: we take a big page and reconcile it into blocks, some of
* which we write and discard, the rest of which we re-create as smaller
* in-memory pages, (restoring the updates that stopped us from writing
- * the block), and inserting the whole mess into the page's parent.
- *
- * Otherwise, if eviction is getting pressed, configure reconciliation
- * to write not-yet-globally-visible updates to the lookaside table,
- * allowing the eviction of pages we'd otherwise have to retain in cache
- * to support older readers.
+ * the block), and inserting the whole mess into the page's parent. Set
+ * the flag in all cases because the incremental cost of update/restore
+ * in reconciliation is minimal, eviction shouldn't have picked a page
+ * where update/restore is necessary, absent some cache pressure. It's
+ * possible updates occurred after we selected this page for eviction,
+ * but it's unlikely and we don't try and manage that risk.
*
- * Finally, if we don't need to do eviction at the moment, create disk
- * images of split pages in order to re-instantiate them.
+ * Additionally, if we aren't trying to free space in the cache, scrub
+ * the page and keep it in memory.
*/
cache = S2C(session)->cache;
if (closing)
@@ -524,25 +526,33 @@ __evict_review(
if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
LF_SET(WT_EVICT_IN_MEMORY |
WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE);
- else if (__wt_cache_stuck(session))
- LF_SET(WT_EVICT_LOOKASIDE);
- else if (!__wt_txn_visible_all(
- session, page->modify->update_txn) ||
- page->read_gen == WT_READGEN_OLDEST ||
- page->memory_footprint >= S2BT(session)->splitmempage)
+ else {
LF_SET(WT_EVICT_UPDATE_RESTORE);
- /*
- * If we aren't trying to free space in the cache, scrub the
- * page and keep it around.
- */
- if (!LF_ISSET(WT_EVICT_LOOKASIDE) &&
- F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
- LF_SET(WT_EVICT_SCRUB);
+ if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB))
+ LF_SET(WT_EVICT_SCRUB);
+ }
}
- *flagsp = flags;
- WT_RET(__wt_reconcile(session, ref, NULL, flags));
+ /* Reconcile the page. */
+ ret = __wt_reconcile(session, ref, NULL, flags, &lookaside_retry);
+
+ /*
+ * If reconciliation fails, eviction is stuck and reconciliation reports
+ * it might succeed if we use the lookaside table (the page didn't have
+ * uncommitted updates, it was not-yet-globally visible updates causing
+ * the problem), configure reconciliation to write those updates to the
+ * lookaside table, allowing the eviction of pages we'd otherwise have
+ * to retain in cache to support older readers.
+ */
+ if (ret == EBUSY && __wt_cache_stuck(session) && lookaside_retry) {
+ LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE);
+ LF_SET(WT_EVICT_LOOKASIDE);
+ ret = __wt_reconcile(session, ref, NULL, flags, NULL);
+ }
+
+ *flagsp = flags;
+ WT_RET(ret);
/*
* Success: assert the page is clean or reconciliation was configured
diff --git a/src/include/extern.h b/src/include/extern.h
index b617b66e8aa..42b34f75a17 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -542,7 +542,7 @@ extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uin
extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
-extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
+extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden")));
diff --git a/src/os_posix/os_fs.c b/src/os_posix/os_fs.c
index 26be0f049cc..bc8cbf67025 100644
--- a/src/os_posix/os_fs.c
+++ b/src/os_posix/os_fs.c
@@ -687,8 +687,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
advise_flag = POSIX_FADV_RANDOM;
if (LF_ISSET(WT_FS_OPEN_ACCESS_SEQ))
advise_flag = POSIX_FADV_SEQUENTIAL;
- WT_SYSCALL(
- posix_fadvise(pfh->fd, 0, 0, advise_flag), ret);
+ WT_SYSCALL(posix_fadvise(pfh->fd, 0, 0, advise_flag), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
"%s: handle-open: posix_fadvise", name);
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index f71715412af..86749eef2e1 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -44,7 +44,8 @@ typedef struct {
uint64_t max_txn;
/* Track if all updates were skipped. */
- bool all_skipped;
+ uint64_t update_cnt;
+ uint64_t update_skip_cnt;
/*
* When we can't mark the page clean (for example, checkpoint found some
@@ -349,8 +350,8 @@ static void __rec_dictionary_reset(WT_RECONCILE *);
* Reconcile an in-memory page into its on-disk format, and write it.
*/
int
-__wt_reconcile(WT_SESSION_IMPL *session,
- WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
+__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref,
+ WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp)
{
WT_DECL_RET;
WT_PAGE *page;
@@ -360,6 +361,8 @@ __wt_reconcile(WT_SESSION_IMPL *session,
page = ref->page;
mod = page->modify;
+ if (lookaside_retryp != NULL)
+ *lookaside_retryp = false;
__wt_verbose(session,
WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type));
@@ -438,6 +441,14 @@ __wt_reconcile(WT_SESSION_IMPL *session,
/* Release the reconciliation lock. */
__wt_writeunlock(session, &page->page_lock);
+ /*
+ * If our caller can configure lookaside table reconciliation, flag if
+ * that's worth trying. The lookaside table doesn't help if we skipped
+ * updates, it can only help with older readers preventing eviction.
+ */
+ if (lookaside_retryp != NULL && r->update_cnt == r->update_skip_cnt)
+ *lookaside_retryp = true;
+
/* Update statistics. */
WT_STAT_CONN_INCR(session, rec_pages);
WT_STAT_DATA_INCR(session, rec_pages);
@@ -545,6 +556,9 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
static int
__rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
+ WT_BOUNDARY *bnd;
+ size_t i;
+
/*
* If we have used the lookaside table, check for a lookaside table and
* checkpoint collision.
@@ -553,14 +567,18 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r)
return (EBUSY);
/*
- * If we are doing eviction and restoring updates, there is only one
- * block and all update were skipped, no progress has been made and
- * there is no point swapping the new page into place.
+ * If we are doing update/restore based eviction, confirm part of the
+ * page is being discarded, or at least 10% of the updates won't have
+ * to be re-instantiated. Otherwise, it isn't progress, don't bother.
*/
- if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->all_skipped &&
- r->bnd_next == 1 && r->bnd[0].supd != NULL)
- return (EBUSY);
-
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
+ for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i)
+ if (bnd->supd == NULL)
+ break;
+ if (i == r->bnd_entries &&
+ r->update_cnt / 10 >= r->update_skip_cnt)
+ return (EBUSY);
+ }
return (0);
}
@@ -724,7 +742,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
* Fake up a reference structure, and write the next root page.
*/
__wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
- return (__wt_reconcile(session, &fake_ref, NULL, flags));
+ return (__wt_reconcile(session, &fake_ref, NULL, flags, NULL));
err: __wt_page_out(session, &next);
return (ret);
@@ -866,7 +884,7 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->max_txn = WT_TXN_NONE;
/* Track if all updates were skipped. */
- r->all_skipped = true;
+ r->update_cnt = r->update_skip_cnt = 0;
/* Track if the page can be marked clean. */
r->leave_dirty = false;
@@ -1109,6 +1127,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
} else
upd_list = ins->upd;
+ ++r->update_cnt;
for (skipped = false,
max_txn = WT_TXN_NONE, min_txn = UINT64_MAX,
upd = upd_list; upd != NULL; upd = upd->next) {
@@ -1199,7 +1218,12 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
txnid != S2C(session)->txn_global.checkpoint_txnid ||
WT_SESSION_IS_CHECKPOINT(session));
#endif
- r->all_skipped = false;
+
+ /*
+ * Track how many update chains we saw vs. how many update
+ * chains had an entry we skipped.
+ */
+ ++r->update_skip_cnt;
return (0);
}