diff options
author | Keith Bostic <keith.bostic@mongodb.com> | 2016-12-06 21:52:34 -0500 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-12-07 13:52:34 +1100 |
commit | 1adda6a0a51e51ffa5a3fa9cf86f6f765585a5f4 (patch) | |
tree | 489278ba31b4a033111e33a0337fd9a545ebdabf | |
parent | c68e35c4c8aca43ebfebaf1ed8a68bef71ccb4cc (diff) | |
download | mongo-1adda6a0a51e51ffa5a3fa9cf86f6f765585a5f4.tar.gz |
WT-2960 Reduce likelihood of using the lookaside file, especially when inserting multi-megabyte values (#3171)
Don't configure the lookaside table as soon as eviction is stuck. Instead,
configure it only if update/restore reconciliation fails and there's reason
to believe the lookaside table will be effective, based on the updates that
were skipped during the update/restore reconciliation.
Previously, the evaluation of whether an update/restore reconciliation would
make progress was a check for a single block rewrite with no updates to
restore, or at least one update chain without a skipped entry. Check more
deeply: if there's any block without updates to be restored, or at least 10%
of the update chains didn't have skipped entries, assume we're making
progress.
-rw-r--r-- | src/btree/bt_slvg.c | 4 | ||||
-rw-r--r-- | src/btree/bt_sync.c | 6 | ||||
-rw-r--r-- | src/evict/evict_file.c | 3 | ||||
-rw-r--r-- | src/evict/evict_page.c | 68 | ||||
-rw-r--r-- | src/include/extern.h | 2 | ||||
-rw-r--r-- | src/os_posix/os_fs.c | 3 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 50 |
7 files changed, 85 insertions, 51 deletions
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index f269c2d7f43..fde4d4fb9de 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1300,7 +1300,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->pg_var_d = save_col_var; @@ -2011,7 +2011,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ page->pg_row_entries += skip_stop; diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 129d7fec05f..7bf15baa67f 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -136,8 +136,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; - WT_ERR(__wt_reconcile( - session, walk, NULL, WT_CHECKPOINTING)); + WT_ERR(__wt_reconcile(session, + walk, NULL, WT_CHECKPOINTING, NULL)); } } break; @@ -233,7 +233,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) ++leaf_pages; } WT_ERR(__wt_reconcile( - session, walk, NULL, WT_CHECKPOINTING)); + session, walk, NULL, WT_CHECKPOINTING, NULL)); } break; case WT_SYNC_CLOSE: diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index b0cd50cc655..17b038fb003 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -57,7 +57,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * error, retrying later. 
*/ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) - WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); + WT_ERR(__wt_reconcile( + session, ref, NULL, WT_EVICTING, NULL)); /* * We can't evict the page just returned to us (it marks our diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 3d1557e027e..893133432bb 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -396,7 +396,7 @@ __evict_review( WT_DECL_RET; WT_PAGE *page; uint32_t flags; - bool modified; + bool lookaside_retry, modified; flags = WT_EVICTING; *flagsp = flags; @@ -495,27 +495,29 @@ __evict_review( * If we have an exclusive lock (we're discarding the tree), assert * there are no updates we cannot read. * - * Don't set any other flags for internal pages: they don't have update - * lists to be saved and restored, nor can we re-create them in memory. + * Don't set any other flags for internal pages: there are no update + * lists to be saved and restored, changes can't be written into the + * lookaside table, nor can we re-create internal pages in memory. * * For leaf pages: * - * If an in-memory configuration or the page is being forcibly evicted, - * set the update-restore flag, so reconciliation will write blocks it + * In-memory pages are a known configuration. + * + * Set the update/restore flag, so reconciliation will write blocks it * can write and create a list of skipped updates for blocks it cannot - * write, along with disk images. This is how eviction of active, huge + * write, along with disk images. This is how eviction of active, huge * pages works: we take a big page and reconcile it into blocks, some of * which we write and discard, the rest of which we re-create as smaller * in-memory pages, (restoring the updates that stopped us from writing - * the block), and inserting the whole mess into the page's parent. 
- * - * Otherwise, if eviction is getting pressed, configure reconciliation - * to write not-yet-globally-visible updates to the lookaside table, - * allowing the eviction of pages we'd otherwise have to retain in cache - * to support older readers. + * the block), and inserting the whole mess into the page's parent. Set + * the flag in all cases because the incremental cost of update/restore + * in reconciliation is minimal, eviction shouldn't have picked a page + * where update/restore is necessary, absent some cache pressure. It's + * possible updates occurred after we selected this page for eviction, + * but it's unlikely and we don't try and manage that risk. * - * Finally, if we don't need to do eviction at the moment, create disk - * images of split pages in order to re-instantiate them. + * Additionally, if we aren't trying to free space in the cache, scrub + * the page and keep it in memory. */ cache = S2C(session)->cache; if (closing) @@ -524,25 +526,33 @@ __evict_review( if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) LF_SET(WT_EVICT_IN_MEMORY | WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE); - else if (__wt_cache_stuck(session)) - LF_SET(WT_EVICT_LOOKASIDE); - else if (!__wt_txn_visible_all( - session, page->modify->update_txn) || - page->read_gen == WT_READGEN_OLDEST || - page->memory_footprint >= S2BT(session)->splitmempage) + else { LF_SET(WT_EVICT_UPDATE_RESTORE); - /* - * If we aren't trying to free space in the cache, scrub the - * page and keep it around. - */ - if (!LF_ISSET(WT_EVICT_LOOKASIDE) && - F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) - LF_SET(WT_EVICT_SCRUB); + if (F_ISSET(cache, WT_CACHE_EVICT_SCRUB)) + LF_SET(WT_EVICT_SCRUB); + } } - *flagsp = flags; - WT_RET(__wt_reconcile(session, ref, NULL, flags)); + /* Reconcile the page. 
*/ + ret = __wt_reconcile(session, ref, NULL, flags, &lookaside_retry); + + /* + * If reconciliation fails, eviction is stuck and reconciliation reports + * it might succeed if we use the lookaside table (the page didn't have + * uncommitted updates, it was not-yet-globally visible updates causing + * the problem), configure reconciliation to write those updates to the + * lookaside table, allowing the eviction of pages we'd otherwise have + * to retain in cache to support older readers. + */ + if (ret == EBUSY && __wt_cache_stuck(session) && lookaside_retry) { + LF_CLR(WT_EVICT_SCRUB | WT_EVICT_UPDATE_RESTORE); + LF_SET(WT_EVICT_LOOKASIDE); + ret = __wt_reconcile(session, ref, NULL, flags, NULL); + } + + *flagsp = flags; + WT_RET(ret); /* * Success: assert the page is clean or reconciliation was configured diff --git a/src/include/extern.h b/src/include/extern.h index b617b66e8aa..42b34f75a17 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -542,7 +542,7 @@ extern int __wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page, const uin extern void __wt_ovfl_txnc_free(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); 
extern uint32_t __wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/os_posix/os_fs.c b/src/os_posix/os_fs.c index 26be0f049cc..bc8cbf67025 100644 --- a/src/os_posix/os_fs.c +++ b/src/os_posix/os_fs.c @@ -687,8 +687,7 @@ __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, advise_flag = POSIX_FADV_RANDOM; if (LF_ISSET(WT_FS_OPEN_ACCESS_SEQ)) advise_flag = POSIX_FADV_SEQUENTIAL; - WT_SYSCALL( - posix_fadvise(pfh->fd, 0, 0, advise_flag), ret); + WT_SYSCALL(posix_fadvise(pfh->fd, 0, 0, advise_flag), ret); if (ret != 0) WT_ERR_MSG(session, ret, "%s: handle-open: posix_fadvise", name); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index f71715412af..86749eef2e1 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -44,7 +44,8 @@ typedef struct { uint64_t max_txn; /* Track if all updates were skipped. */ - bool all_skipped; + uint64_t update_cnt; + uint64_t update_skip_cnt; /* * When we can't mark the page clean (for example, checkpoint found some @@ -349,8 +350,8 @@ static void __rec_dictionary_reset(WT_RECONCILE *); * Reconcile an in-memory page into its on-disk format, and write it. 
*/ int -__wt_reconcile(WT_SESSION_IMPL *session, - WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) +__wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, + WT_SALVAGE_COOKIE *salvage, uint32_t flags, bool *lookaside_retryp) { WT_DECL_RET; WT_PAGE *page; @@ -360,6 +361,8 @@ __wt_reconcile(WT_SESSION_IMPL *session, page = ref->page; mod = page->modify; + if (lookaside_retryp != NULL) + *lookaside_retryp = false; __wt_verbose(session, WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)); @@ -438,6 +441,14 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* Release the reconciliation lock. */ __wt_writeunlock(session, &page->page_lock); + /* + * If our caller can configure lookaside table reconciliation, flag if + * that's worth trying. The lookaside table doesn't help if we skipped + * updates, it can only help with older readers preventing eviction. + */ + if (lookaside_retryp != NULL && r->update_cnt == r->update_skip_cnt) + *lookaside_retryp = true; + /* Update statistics. */ WT_STAT_CONN_INCR(session, rec_pages); WT_STAT_DATA_INCR(session, rec_pages); @@ -545,6 +556,9 @@ __rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) static int __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) { + WT_BOUNDARY *bnd; + size_t i; + /* * If we have used the lookaside table, check for a lookaside table and * checkpoint collision. @@ -553,14 +567,18 @@ __rec_write_check_complete(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); /* - * If we are doing eviction and restoring updates, there is only one - * block and all update were skipped, no progress has been made and - * there is no point swapping the new page into place. + * If we are doing update/restore based eviction, confirm part of the + * page is being discarded, or at least 10% of the updates won't have + * to be re-instantiated. Otherwise, it isn't progress, don't bother. 
*/ - if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && r->all_skipped && - r->bnd_next == 1 && r->bnd[0].supd != NULL) - return (EBUSY); - + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { + for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) + if (bnd->supd == NULL) + break; + if (i == r->bnd_entries && + r->update_cnt / 10 >= r->update_skip_cnt) + return (EBUSY); + } return (0); } @@ -724,7 +742,7 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) * Fake up a reference structure, and write the next root page. */ __wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT); - return (__wt_reconcile(session, &fake_ref, NULL, flags)); + return (__wt_reconcile(session, &fake_ref, NULL, flags, NULL)); err: __wt_page_out(session, &next); return (ret); @@ -866,7 +884,7 @@ __rec_write_init(WT_SESSION_IMPL *session, r->max_txn = WT_TXN_NONE; /* Track if all updates were skipped. */ - r->all_skipped = true; + r->update_cnt = r->update_skip_cnt = 0; /* Track if the page can be marked clean. */ r->leave_dirty = false; @@ -1109,6 +1127,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; + ++r->update_cnt; for (skipped = false, max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; upd != NULL; upd = upd->next) { @@ -1199,7 +1218,12 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, txnid != S2C(session)->txn_global.checkpoint_txnid || WT_SESSION_IS_CHECKPOINT(session)); #endif - r->all_skipped = false; + + /* + * Track how many update chains we saw vs. how many update + * chains had an entry we skipped. + */ + ++r->update_skip_cnt; return (0); } |