summary | refs | log | tree | commit | diff
path: root/src/third_party/wiredtiger/src/btree
diff options
context:
space:
mode:
author: Alex Gorrod <alexander.gorrod@mongodb.com> 2016-08-04 16:10:27 +1000
committer: Alex Gorrod <alexander.gorrod@mongodb.com> 2016-08-04 16:20:01 +1000
commit: c4612847ed88c5e0500f0e9ecb2ecdcb49621cf1 (patch)
tree: aa58b63ad77ea46be39793aa3d9f94a6b0104163 /src/third_party/wiredtiger/src/btree
parent: 5e9b3e958ad96db67a8f4f8790947b1b62b6469e (diff)
download: mongo-c4612847ed88c5e0500f0e9ecb2ecdcb49621cf1.tar.gz
Import wiredtiger-wiredtiger-2.8.0-589-ga9e9696.tar.gz from wiredtiger branch mongodb-3.4
ref: d8fb874..a9e9696 for: 3.3.11 SERVER-24971 Excessive memory held by sessions when application threads do evictions WT-1162 Add latency to Jenkins wtperf tests and plots WT-2026 Maximum pages size at eviction too large WT-2239 Make sure LSM cursors read up to date dsk_gen, it was racing with compact WT-2353 Failure to create async threads as part of a wiredtiger_open call will cause a hang WT-2380 Make scripts fail if code doesn't match style WT-2486 Update make check so that it runs faster WT-2578 remove write barriers from the TAILQ_INSERT_XXX macros WT-2648 cache-line alignment for new ports WT-2665 Limit allocator fragmentation in WiredTiger WT-2693 Check open_cursor error paths for consistent handling WT-2708 split child-update race with reconciliation/eviction WT-2711 Change statistics log configuration options WT-2728 Don't re-read log file headers during log_flush WT-2729 Focus eviction walks in largest trees WT-2730 cursor next/prev can return the wrong key/value pair when crossing a page boundary WT-2731 Raw compression can create pages that are larger than expected WT-2732 Coverity analysis defect 99665: Redundant test WT-2737 Scrub dirty pages rather than evicting them WT-2738 Remove the ability to change the default checkpoint name WT-2739 pluggable file systems documentation cleanups WT-2743 Thread count statistics always report 0 WT-2744 partial line even with line buffering set WT-2746 track checkpoint I/O separately from eviction I/O WT-2751 column-store statistics incorrectly calculates the number of entries WT-2752 Fixes to zipfian wtperf workload config WT-2755 flexelint configuration treats size_t as 4B type WT-2756 Upgrade the autoconf archive package to check for swig 3.0 WT-2757 Column tables behave differently when column names are provided WT-2759 Releasing the hot-backup lock doesn't require the schema lock. WT-2760 Fix a bug in backup related to directory sync. 
Change the filesystem API to make durable the default WT-2762 wtstats tool fails if checkpoint runs WT-2763 Unit test test_intpack failing on OSX WT-2764 Optimize checkpoints to reduce throughput disruption WT-2765 wt dump: indices need to be shown in the dump output WT-2767 test suite needs way to run an individual scenario WT-2769 Update documentation to reflect correct limits of memory_page_max WT-2770 Add statistics tracking schema operations WT-2772 Investigate log performance testing weirdness WT-2773 search_near in indexes does not find exact matches WT-2774 minor cleanups/improvements WT-2778 Python test suite: make scenario initialization consistent WT-2779 Raw compression created unexpectedly large pages on disk WT-2781 Enhance bulk cursor option with an option to return immediately on contention WT-2782 Missing a fs_directory_list_free in ex_file_system.c WT-2785 Scrub dirty pages rather than evicting them: single-page reconciliation WT-2791 Enhance OS X Evergreen unit test WT-2793 wtperf config improvements WT-2796 Memory leak in reconciliation uncovered by stress testing WT-2798 Crash vulnerability with nojournal after create during checkpoint WT-2800 Illegal file format in test/format on PPC WT-2801 Crash vulnerability from eviction of metadata during checkpoint WT-2802 Transaction commit causes heap-use-after free WT-2803 Add verbose functionality to WT Evergreen tests WT-2804 Don't read values in a tree without a snapshot WT-2805 Infinite recursion if error streams fail WT-2806 wtperf allocation size off-by-one
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curnext.c 14
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_curprev.c 6
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_discard.c 4
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_handle.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_huffman.c 3
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_io.c 9
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_ovfl.c 2
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c 1
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_read.c 27
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 141
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_stat.c 5
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_sync.c 58
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_walk.c 20
-rw-r--r-- src/third_party/wiredtiger/src/btree/row_srch.c 2
14 files changed, 170 insertions, 124 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index 70b3ba56e31..e1b097c22a5 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -183,6 +183,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->ref->ref_recno);
+ cbt->cip_saved = NULL;
goto new_page;
}
@@ -301,12 +302,13 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
* WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
* odd-numbered slots, and WT_ROW array slots are even-numbered slots.
*
- * New page configuration.
+ * Initialize for each new page.
*/
if (newpage) {
cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
cbt->row_iteration_slot = 1;
+ cbt->rip_saved = NULL;
goto new_insert;
}
@@ -517,11 +519,13 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
*/
F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV);
- /*
- * Clear the count of deleted items on the page.
- */
+ /* Clear the count of deleted items on the page. */
cbt->page_deleted_count = 0;
+ /* Clear saved iteration cursor position information. */
+ cbt->cip_saved = NULL;
+ cbt->rip_saved = NULL;
+
/*
* If we don't have a search page, then we're done, we're starting at
* the beginning or end of the tree, not as a result of a search.
@@ -661,7 +665,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating)
if (page != NULL &&
(cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
(newpage && cbt->page_deleted_count > 0)))
- __wt_page_evict_soon(page);
+ WT_ERR(__wt_page_evict_soon(session, cbt->ref));
cbt->page_deleted_count = 0;
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 872f648446c..e39dffa357f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -329,6 +329,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->last_standard_recno);
+ cbt->cip_saved = NULL;
goto new_page;
}
@@ -447,7 +448,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
* WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are
* odd-numbered slots, and WT_ROW array slots are even-numbered slots.
*
- * New page configuration.
+ * Initialize for each new page.
*/
if (newpage) {
/*
@@ -464,6 +465,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
cbt->ins = WT_SKIP_LAST(cbt->ins_head);
cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
+ cbt->rip_saved = NULL;
goto new_insert;
}
@@ -619,7 +621,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
if (page != NULL &&
(cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
(newpage && cbt->page_deleted_count > 0)))
- __wt_page_evict_soon(page);
+ WT_ERR(__wt_page_evict_soon(session, cbt->ref));
cbt->page_deleted_count = 0;
WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c
index a00bb7dc2b5..965aec16fc2 100644
--- a/src/third_party/wiredtiger/src/btree/bt_discard.c
+++ b/src/third_party/wiredtiger/src/btree/bt_discard.c
@@ -131,8 +131,10 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
/* Discard any disk image. */
dsk = (WT_PAGE_HEADER *)page->dsk;
- if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) {
+ __wt_cache_page_image_decr(session, dsk->mem_size);
__wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
+ }
/* Discard any mapped image. */
if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c
index c97e05d74a7..cacf1369430 100644
--- a/src/third_party/wiredtiger/src/btree/bt_handle.c
+++ b/src/third_party/wiredtiger/src/btree/bt_handle.c
@@ -690,6 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
* Don't let pages grow large compared to the cache size or we can end
* up in a situation where nothing can be evicted. Take care getting
* the cache size: with a shared cache, it may not have been set.
+ * Don't forget to update the API documentation if you alter the
+ * bounds for any of the parameters here.
*/
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage = (uint64_t)cval.val;
diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c
index 9e9d69c342e..918791d9c6e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_huffman.c
+++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c
@@ -157,7 +157,8 @@ __huffman_confchk_file(WT_SESSION_IMPL *session,
/* Check the file exists. */
WT_RET(__wt_strndup(session, v->str + len, v->len - len, &fname));
- WT_ERR(__wt_fopen(session, fname, WT_OPEN_FIXED, WT_STREAM_READ, &fs));
+ WT_ERR(__wt_fopen(
+ session, fname, WT_FS_OPEN_FIXED, WT_STREAM_READ, &fs));
/* Optionally return the file handle. */
if (fsp == NULL)
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
index 4339de6f25c..6c2e2f1b3fb 100644
--- a/src/third_party/wiredtiger/src/btree/bt_io.c
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -117,7 +117,7 @@ __wt_bt_read(WT_SESSION_IMPL *session,
*/
if (ret != 0 ||
result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
- fail_msg = "block decryption failed";
+ fail_msg = "block decompression failed";
goto corrupt;
}
} else
@@ -168,7 +168,8 @@ err: __wt_scr_free(session, &tmp);
*/
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
- uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed)
+ uint8_t *addr, size_t *addr_sizep,
+ bool checkpoint, bool checkpoint_io, bool compressed)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -359,10 +360,12 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
/* Call the block manager to write the block. */
WT_ERR(checkpoint ?
bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
- bm->write(bm, session, ip, addr, addr_sizep, data_cksum));
+ bm->write(
+ bm, session, ip, addr, addr_sizep, data_cksum, checkpoint_io));
WT_STAT_FAST_CONN_INCR(session, cache_write);
WT_STAT_FAST_DATA_INCR(session, cache_write);
+ S2C(session)->cache->bytes_written += dsk->mem_size;
WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index fbe361e000a..1f080041a23 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -33,6 +33,7 @@ __ovfl_read(WT_SESSION_IMPL *session,
store->data = WT_PAGE_HEADER_BYTE(btree, dsk);
store->size = dsk->u.datalen;
+ WT_STAT_FAST_CONN_INCR(session, cache_read_overflow);
WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);
return (0);
@@ -208,6 +209,7 @@ __wt_ovfl_cache(WT_SESSION_IMPL *session,
*/
if (!visible) {
WT_RET(__ovfl_cache(session, page, vpack));
+ WT_STAT_FAST_CONN_INCR(session, cache_overflow_value);
WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 00ec8aa4494..89e5f428628 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -219,6 +219,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
/* Update the page's in-memory size and the cache statistics. */
__wt_cache_page_inmem_incr(session, page, size);
+ __wt_cache_page_image_incr(session, dsk->mem_size);
/* Link the new internal page to the parent. */
if (ref != NULL) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 086500c8b2f..3d396d5ae5b 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -296,7 +296,7 @@ err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
* __evict_force_check --
* Check if a page matches the criteria for forced eviction.
*/
-static int
+static bool
__evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_BTREE *btree;
@@ -307,26 +307,26 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
/* Leaf pages only. */
if (WT_PAGE_IS_INTERNAL(page))
- return (0);
+ return (false);
/*
* It's hard to imagine a page with a huge memory footprint that has
* never been modified, but check to be sure.
*/
if (page->modify == NULL)
- return (0);
+ return (false);
/* Pages are usually small enough, check that first. */
if (page->memory_footprint < btree->splitmempage)
- return (0);
+ return (false);
else if (page->memory_footprint < btree->maxmempage)
return (__wt_leaf_page_can_split(session, page));
/* Trigger eviction on the next page release. */
- __wt_page_evict_soon(page);
+ (void)__wt_page_evict_soon(session, ref);
/* Bump the oldest ID, we're about to do some visibility checks. */
- WT_RET(__wt_txn_update_oldest(session, 0));
+ (void)__wt_txn_update_oldest(session, 0);
/* If eviction cannot succeed, don't try. */
return (__wt_page_can_evict(session, ref, NULL));
@@ -548,10 +548,14 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
* if the page qualifies for forced eviction and update
* the page's generation number. If eviction isn't being
* done on this file, we're done.
+ * In-memory split of large pages is allowed while
+ * no_eviction is set on btree, whereas reconciliation
+ * is not allowed.
*/
if (LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
- F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ (F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
+ !F_ISSET(btree, WT_BTREE_NO_RECONCILE)))
goto skip_evict;
/*
@@ -595,7 +599,14 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
page = ref->page;
if (page->read_gen == WT_READGEN_NOTSET) {
if (evict_soon)
- __wt_page_evict_soon(page);
+ /*
+ * Ignore error returns, since the
+ * evict soon call is advisory and we
+ * are holding a hazard pointer to the
+ * page already.
+ */
+ (void)__wt_page_evict_soon(
+ session, ref);
else
__wt_cache_read_gen_new(session, page);
} else if (!LF_ISSET(WT_READ_NO_GEN))
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 7a05a883f83..4f6f300802e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -298,7 +298,7 @@ static int
__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp)
{
- WT_ADDR *addr;
+ WT_ADDR *addr, *ref_addr;
WT_CELL_UNPACK unpack;
WT_DECL_RET;
WT_IKEY *ikey;
@@ -345,13 +345,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
- * If there's no address (the page has never been written), or the
- * address has been instantiated, there's no work to do. Otherwise,
- * instantiate the address in-memory, from the on-page cell.
+ * If there's no address at all (the page has never been written), or
+ * the address has already been instantiated, there's no work to do.
+ * Otherwise, the address still references a split page on-page cell,
+ * instantiate it. We can race with reconciliation and/or eviction of
+ * the child pages, be cautious: read the address and verify it, and
+ * only update it if the value is unchanged from the original. In the
+ * case of a race, the address must no longer reference the split page,
+ * we're done.
*/
- addr = ref->addr;
- if (addr != NULL && !__wt_off_page(from_home, addr)) {
- __wt_cell_unpack((WT_CELL *)ref->addr, &unpack);
+ WT_ORDERED_READ(ref_addr, ref->addr);
+ if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) {
+ __wt_cell_unpack((WT_CELL *)ref_addr, &unpack);
WT_RET(__wt_calloc_one(session, &addr));
if ((ret = __wt_strndup(
session, unpack.data, unpack.size, &addr->addr)) != 0) {
@@ -371,7 +376,10 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
break;
WT_ILLEGAL_VALUE(session);
}
- ref->addr = addr;
+ if (!__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr)) {
+ __wt_free(session, addr->addr);
+ __wt_free(session, addr);
+ }
}
/* And finally, copy the WT_REF pointer itself. */
@@ -786,7 +794,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
if (result_entries == 0) {
empty_parent = true;
- __wt_page_evict_soon(parent);
+ if (!__wt_ref_is_root(parent->pg_intl_parent_ref))
+ ret = __wt_page_evict_soon(
+ session, parent->pg_intl_parent_ref);
goto err;
}
@@ -1462,11 +1472,11 @@ err: if (parent != NULL)
/*
* __split_multi_inmem --
- * Instantiate a page in a multi-block set.
+ * Instantiate a page from a disk image.
*/
static int
__split_multi_inmem(
- WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi)
+ WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT_REF *ref)
{
WT_CURSOR_BTREE cbt;
WT_DECL_ITEM(key);
@@ -1487,13 +1497,12 @@ __split_multi_inmem(
orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0);
/*
- * This code re-creates an in-memory page that is part of a set created
- * while evicting a large page, and adds references to any unresolved
- * update chains to the new page. We get here due to choosing to keep
- * the results of a split in memory or because and update could not be
- * written when attempting to evict a page.
+ * This code re-creates an in-memory page from a disk image, and adds
+ * references to any unresolved update chains to the new page. We get
+ * here either because an update could not be written when evicting a
+ * page, or eviction chose to keep a page in memory.
*
- * Clear the disk image and link the page into the passed-in WT_REF to
+ * Steal the disk image and link the page into the passed-in WT_REF to
* simplify error handling: our caller will not discard the disk image
* when discarding the original page, and our caller will discard the
* allocated page on error, when discarding the allocated WT_REF.
@@ -1503,6 +1512,19 @@ __split_multi_inmem(
WT_PAGE_DISK_ALLOC, &page));
multi->disk_image = NULL;
+ /*
+ * Put the re-instantiated page in the same LRU queue location as the
+ * original page, unless this was a forced eviction, in which case we
+ * leave the new page with the read generation unset. Eviction will
+ * set the read generation next time it visits this page.
+ */
+ if (orig->read_gen != WT_READGEN_OLDEST)
+ page->read_gen = orig->read_gen;
+
+ /* If there are no updates to apply to the page, we're done. */
+ if (multi->supd_entries == 0)
+ return (0);
+
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
@@ -1551,14 +1573,12 @@ __split_multi_inmem(
}
/*
- * If we modified the page above, it will have set the first dirty
- * transaction to the last transaction currently running. However, the
- * updates we installed may be older than that. Set the first dirty
- * transaction to an impossibly old value so this page is never skipped
- * in a checkpoint.
+ * When modifying the page we set the first dirty transaction to the
+ * last transaction currently running. However, the updates we made
+ * might be older than that. Set the first dirty transaction to an
+ * impossibly old value so this page is never skipped in a checkpoint.
*/
- if (page->modify != NULL)
- page->modify->first_dirty_txn = WT_TXN_FIRST;
+ page->modify->first_dirty_txn = WT_TXN_FIRST;
err: /* Free any resources that may have been cached in the cursor. */
WT_TRET(__wt_btcur_close(&cbt, true));
@@ -1629,19 +1649,17 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref)
*/
int
__wt_multi_to_ref(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp)
+ WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing)
{
WT_ADDR *addr;
WT_IKEY *ikey;
WT_REF *ref;
- size_t incr;
-
- incr = 0;
/* Allocate an underlying WT_REF. */
WT_RET(__wt_calloc_one(session, refp));
ref = *refp;
- incr += sizeof(WT_REF);
+ if (incrp)
+ *incrp += sizeof(WT_REF);
/*
* Set the WT_REF key before (optionally) building the page, underlying
@@ -1653,21 +1671,34 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
ikey = multi->key.ikey;
WT_RET(__wt_row_ikey(
session, 0, WT_IKEY_DATA(ikey), ikey->size, ref));
- incr += sizeof(WT_IKEY) + ikey->size;
+ if (incrp)
+ *incrp += sizeof(WT_IKEY) + ikey->size;
break;
default:
ref->ref_recno = multi->key.recno;
break;
}
- /* If there's a disk image, build a page, otherwise set the address. */
- if (multi->disk_image == NULL) {
- /*
- * Copy the address: we could simply take the buffer, but that
- * would complicate error handling, freeing the reference array
- * would have to avoid freeing the memory, and it's not worth
- * the confusion.
- */
+ /* There should be an address or a disk image (or both). */
+ WT_ASSERT(session,
+ multi->addr.addr != NULL || multi->disk_image != NULL);
+
+ /* If we're closing the file, there better be an address. */
+ WT_ASSERT(session, multi->addr.addr != NULL || !closing);
+
+ /* Verify any disk image we have. */
+ WT_ASSERT(session, multi->disk_image == NULL ||
+ __wt_verify_dsk_image(session,
+ "[page instantiate]", multi->disk_image, 0, false) == 0);
+
+ /*
+ * If there's an address, the page was written, set it.
+ *
+ * Copy the address: we could simply take the buffer, but that would
+ * complicate error handling, freeing the reference array would have
+ * to avoid freeing the memory, and it's not worth the confusion.
+ */
+ if (multi->addr.addr != NULL) {
WT_RET(__wt_calloc_one(session, &addr));
ref->addr = addr;
addr->size = multi->addr.size;
@@ -1675,14 +1706,20 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
WT_RET(__wt_strndup(session,
multi->addr.addr, addr->size, &addr->addr));
ref->state = WT_REF_DISK;
- } else {
- WT_RET(__split_multi_inmem(session, page, ref, multi));
+ }
+
+ /*
+ * If we have a disk image and we're not closing the file,
+ * re-instantiate the page.
+ *
+ * Discard any page image we don't use.
+ */
+ if (multi->disk_image != NULL && !closing) {
+ WT_RET(__split_multi_inmem(session, page, multi, ref));
ref->state = WT_REF_MEM;
}
+ __wt_free(session, multi->disk_image);
- /* Optionally return changes in the memory footprint. */
- if (incrp != NULL)
- *incrp += incr;
return (0);
}
@@ -2086,8 +2123,8 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
*/
WT_RET(__wt_calloc_def(session, new_entries, &ref_new));
for (i = 0; i < new_entries; ++i)
- WT_ERR(__wt_multi_to_ref(session,
- page, &mod->mod_multi[i], &ref_new[i], &parent_incr));
+ WT_ERR(__wt_multi_to_ref(session, page,
+ &mod->mod_multi[i], &ref_new[i], &parent_incr, closing));
/*
* Split into the parent; if we're closing the file, we hold it
@@ -2175,15 +2212,13 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
* Rewrite an in-memory page with a new version.
*/
int
-__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi)
{
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
WT_REF *new;
page = ref->page;
- mod = page->modify;
WT_RET(__wt_verbose(
session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page));
@@ -2198,14 +2233,14 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
*
* Build the new page.
*
- * Allocate a WT_REF because the error path uses routines that will ea
- * free memory. The only field we need to set is the record number, as
- * it's used by the search routines.
+ * Allocate a WT_REF, the error path calls routines that free memory.
+ * The only field we need to set is the record number, as it's used by
+ * the search routines.
*/
WT_RET(__wt_calloc_one(session, &new));
new->ref_recno = ref->ref_recno;
- WT_ERR(__split_multi_inmem(session, page, new, &mod->mod_multi[0]));
+ WT_ERR(__split_multi_inmem(session, page, multi, new));
/*
* The rewrite succeeded, we can no longer fail.
@@ -2213,7 +2248,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
* Finalize the move, discarding moved update lists from the original
* page.
*/
- __split_multi_inmem_final(page, &mod->mod_multi[0]);
+ __split_multi_inmem_final(page, multi);
/*
* Discard the original page.
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index 3d5abf34147..d3ddf33446e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -41,6 +41,9 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
+ WT_STAT_SET(session, stats, cache_bytes_inuse,
+ __wt_btree_bytes_inuse(session));
+
/* Everything else is really, really expensive. */
if (!F_ISSET(cst, WT_CONN_STAT_ALL))
return (0);
@@ -139,7 +142,7 @@ __stat_page_col_var(
} else {
orig_deleted = false;
__wt_cell_unpack(cell, unpack);
- if (unpack->type == WT_CELL_ADDR_DEL)
+ if (unpack->type == WT_CELL_DEL)
orig_deleted = true;
else {
entry_cnt += __wt_cell_rle(unpack);
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index da6c53aa316..df794c96cda 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -84,7 +84,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_ERR(__wt_txn_get_snapshot(session));
leaf_bytes += page->memory_footprint;
++leaf_pages;
- WT_ERR(__wt_reconcile(session, walk, NULL, 0));
+ WT_ERR(__wt_reconcile(
+ session, walk, NULL, WT_CHECKPOINTING));
}
}
break;
@@ -92,7 +93,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
/*
* If we are flushing a file at read-committed isolation, which
* is of particular interest for flushing the metadata to make
- * schema-changing operation durable, get a transactional
+ * a schema-changing operation durable, get a transactional
* snapshot now.
*
* All changes committed up to this point should be included.
@@ -126,7 +127,17 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
*/
WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);
- WT_ERR(__wt_evict_file_exclusive_on(session));
+ /*
+ * Sync for checkpoint allows splits to happen while the queue
+ * is being drained, but not reconciliation. We need to do this,
+ * since draining the queue can take long enough for hot pages
+ * to grow significantly larger than the configured maximum
+ * size.
+ */
+ F_SET(btree, WT_BTREE_NO_RECONCILE);
+ ret = __wt_evict_file_exclusive_on(session);
+ F_CLR(btree, WT_BTREE_NO_RECONCILE);
+ WT_ERR(ret);
__wt_evict_file_exclusive_off(session);
WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);
@@ -183,7 +194,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
leaf_bytes += page->memory_footprint;
++leaf_pages;
}
- WT_ERR(__wt_reconcile(session, walk, NULL, 0));
+ WT_ERR(__wt_reconcile(
+ session, walk, NULL, WT_CHECKPOINTING));
}
break;
case WT_SYNC_CLOSE:
@@ -217,41 +229,9 @@ err: /* On error, clear any left-over tree walk. */
saved_snap_min == WT_TXN_NONE)
__wt_txn_release_snapshot(session);
- if (btree->checkpointing != WT_CKPT_OFF) {
- /*
- * Update the checkpoint generation for this handle so visible
- * updates newer than the checkpoint can be evicted.
- *
- * This has to be published before eviction is enabled again,
- * so that eviction knows that the checkpoint has completed.
- */
- WT_PUBLISH(btree->checkpoint_gen,
- conn->txn_global.checkpoint_gen);
- WT_STAT_FAST_DATA_SET(session,
- btree_checkpoint_generation, btree->checkpoint_gen);
-
- /*
- * Clear the checkpoint flag and push the change; not required,
- * but publishing the change means stalled eviction gets moving
- * as soon as possible.
- */
- btree->checkpointing = WT_CKPT_OFF;
- WT_FULL_BARRIER();
-
- /*
- * If this tree was being skipped by the eviction server during
- * the checkpoint, clear the wait.
- */
- btree->evict_walk_period = 0;
-
- /*
- * Wake the eviction server, in case application threads have
- * stalled while the eviction server decided it couldn't make
- * progress. Without this, application threads will be stalled
- * until the eviction server next wakes.
- */
- WT_TRET(__wt_evict_server_wake(session));
- }
+ /* Clear the checkpoint flag and push the change. */
+ if (btree->checkpointing != WT_CKPT_OFF)
+ WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF);
__wt_spin_unlock(session, &btree->flush_lock);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index bb8a750d848..17d32d6ed63 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -381,16 +381,6 @@ restart: /*
__ref_ascend(session, &ref, &pindex, &slot);
/*
- * If we got all the way through an internal page and
- * all of the child pages were deleted, mark it for
- * eviction.
- */
- if (empty_internal && pindex->entries > 1) {
- __wt_page_evict_soon(ref->page);
- empty_internal = false;
- }
-
- /*
* If at the root and returning internal pages, return
* the root page, otherwise we're done. Regardless, no
* hazard pointer is required, release the one we hold.
@@ -404,6 +394,16 @@ restart: /*
}
/*
+ * If we got all the way through an internal page and
+ * all of the child pages were deleted, mark it for
+ * eviction.
+ */
+ if (empty_internal && pindex->entries > 1) {
+ WT_ERR(__wt_page_evict_soon(session, ref));
+ empty_internal = false;
+ }
+
+ /*
* Optionally return internal pages. Swap our previous
* hazard pointer for the page we'll return. We don't
* handle restart or not-found returns, it would require
diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c
index 4afcd74520f..0f70e84de7e 100644
--- a/src/third_party/wiredtiger/src/btree/row_srch.c
+++ b/src/third_party/wiredtiger/src/btree/row_srch.c
@@ -775,7 +775,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
* traversing the skip list each time accumulates to real time.
*/
if (samples > 5000)
- __wt_page_evict_soon(page);
+ WT_RET(__wt_page_evict_soon(session, cbt->ref));
return (0);
}