diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-08-04 16:10:27 +1000 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2016-08-04 16:20:01 +1000 |
commit | c4612847ed88c5e0500f0e9ecb2ecdcb49621cf1 (patch) | |
tree | aa58b63ad77ea46be39793aa3d9f94a6b0104163 /src/third_party/wiredtiger/src/btree | |
parent | 5e9b3e958ad96db67a8f4f8790947b1b62b6469e (diff) | |
download | mongo-c4612847ed88c5e0500f0e9ecb2ecdcb49621cf1.tar.gz |
Import wiredtiger-wiredtiger-2.8.0-589-ga9e9696.tar.gz from wiredtiger branch mongodb-3.4
ref: d8fb874..a9e9696
for: 3.3.11
SERVER-24971 Excessive memory held by sessions when application threads do evictions
WT-1162 Add latency to Jenkins wtperf tests and plots
WT-2026 Maximum pages size at eviction too large
WT-2239 Make sure LSM cursors read up to date dsk_gen, it was racing with compact
WT-2353 Failure to create async threads as part of a wiredtiger_open call will cause a hang
WT-2380 Make scripts fail if code doesn't match style
WT-2486 Update make check so that it runs faster
WT-2578 remove write barriers from the TAILQ_INSERT_XXX macros
WT-2648 cache-line alignment for new ports
WT-2665 Limit allocator fragmentation in WiredTiger
WT-2693 Check open_cursor error paths for consistent handling
WT-2708 split child-update race with reconciliation/eviction
WT-2711 Change statistics log configuration options
WT-2728 Don't re-read log file headers during log_flush
WT-2729 Focus eviction walks in largest trees
WT-2730 cursor next/prev can return the wrong key/value pair when crossing a page boundary
WT-2731 Raw compression can create pages that are larger than expected
WT-2732 Coverity analysis defect 99665: Redundant test
WT-2737 Scrub dirty pages rather than evicting them
WT-2738 Remove the ability to change the default checkpoint name
WT-2739 pluggable file systems documentation cleanups
WT-2743 Thread count statistics always report 0
WT-2744 partial line even with line buffering set
WT-2746 track checkpoint I/O separately from eviction I/O
WT-2751 column-store statistics incorrectly calculates the number of entries
WT-2752 Fixes to zipfian wtperf workload config
WT-2755 flexelint configuration treats size_t as 4B type
WT-2756 Upgrade the autoconf archive package to check for swig 3.0
WT-2757 Column tables behave differently when column names are provided
WT-2759 Releasing the hot-backup lock doesn't require the schema lock
WT-2760 Fix a bug in backup related to directory sync. Change the filesystem API to make durable the default
WT-2762 wtstats tool fails if checkpoint runs
WT-2763 Unit test test_intpack failing on OSX
WT-2764 Optimize checkpoints to reduce throughput disruption
WT-2765 wt dump: indices need to be shown in the dump output
WT-2767 test suite needs way to run an individual scenario
WT-2769 Update documentation to reflect correct limits of memory_page_max
WT-2770 Add statistics tracking schema operations
WT-2772 Investigate log performance testing weirdness
WT-2773 search_near in indexes does not find exact matches
WT-2774 minor cleanups/improvements
WT-2778 Python test suite: make scenario initialization consistent
WT-2779 Raw compression created unexpectedly large pages on disk
WT-2781 Enhance bulk cursor option with an option to return immediately on contention
WT-2782 Missing a fs_directory_list_free in ex_file_system.c
WT-2785 Scrub dirty pages rather than evicting them: single-page reconciliation
WT-2791 Enhance OS X Evergreen unit test
WT-2793 wtperf config improvements
WT-2796 Memory leak in reconciliation uncovered by stress testing
WT-2798 Crash vulnerability with nojournal after create during checkpoint
WT-2800 Illegal file format in test/format on PPC
WT-2801 Crash vulnerability from eviction of metadata during checkpoint
WT-2802 Transaction commit causes heap-use-after free
WT-2803 Add verbose functionality to WT Evergreen tests
WT-2804 Don't read values in a tree without a snapshot
WT-2805 Infinite recursion if error streams fail
WT-2806 wtperf allocation size off-by-one
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_curnext.c | 14 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_curprev.c | 6 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_discard.c | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_handle.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_huffman.c | 3 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_io.c | 9 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_ovfl.c | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_page.c | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_read.c | 27 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 141 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_stat.c | 5 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_sync.c | 58 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_walk.c | 20 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/row_srch.c | 2 |
14 files changed, 170 insertions, 124 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 70b3ba56e31..e1b097c22a5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -183,6 +183,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage) if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->ref->ref_recno); + cbt->cip_saved = NULL; goto new_page; } @@ -301,12 +302,13 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are * odd-numbered slots, and WT_ROW array slots are even-numbered slots. * - * New page configuration. + * Initialize for each new page. */ if (newpage) { cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); cbt->ins = WT_SKIP_FIRST(cbt->ins_head); cbt->row_iteration_slot = 1; + cbt->rip_saved = NULL; goto new_insert; } @@ -517,11 +519,13 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) */ F_SET(cbt, WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV); - /* - * Clear the count of deleted items on the page. - */ + /* Clear the count of deleted items on the page. */ cbt->page_deleted_count = 0; + /* Clear saved iteration cursor position information. */ + cbt->cip_saved = NULL; + cbt->rip_saved = NULL; + /* * If we don't have a search page, then we're done, we're starting at * the beginning or end of the tree, not as a result of a search. 
@@ -661,7 +665,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) - __wt_page_evict_soon(page); + WT_ERR(__wt_page_evict_soon(session, cbt->ref)); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 872f648446c..e39dffa357f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -329,6 +329,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); + cbt->cip_saved = NULL; goto new_page; } @@ -447,7 +448,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are * odd-numbered slots, and WT_ROW array slots are even-numbered slots. * - * New page configuration. + * Initialize for each new page. 
*/ if (newpage) { /* @@ -464,6 +465,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; + cbt->rip_saved = NULL; goto new_insert; } @@ -619,7 +621,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) - __wt_page_evict_soon(page); + WT_ERR(__wt_page_evict_soon(session, cbt->ref)); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index a00bb7dc2b5..965aec16fc2 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -131,8 +131,10 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) /* Discard any disk image. */ dsk = (WT_PAGE_HEADER *)page->dsk; - if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) + if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) { + __wt_cache_page_image_decr(session, dsk->mem_size); __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); + } /* Discard any mapped image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index c97e05d74a7..cacf1369430 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -690,6 +690,8 @@ __btree_page_sizes(WT_SESSION_IMPL *session) * Don't let pages grow large compared to the cache size or we can end * up in a situation where nothing can be evicted. Take care getting * the cache size: with a shared cache, it may not have been set. + * Don't forget to update the API documentation if you alter the + * bounds for any of the parameters here. 
*/ WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = (uint64_t)cval.val; diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c index 9e9d69c342e..918791d9c6e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_huffman.c +++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c @@ -157,7 +157,8 @@ __huffman_confchk_file(WT_SESSION_IMPL *session, /* Check the file exists. */ WT_RET(__wt_strndup(session, v->str + len, v->len - len, &fname)); - WT_ERR(__wt_fopen(session, fname, WT_OPEN_FIXED, WT_STREAM_READ, &fs)); + WT_ERR(__wt_fopen( + session, fname, WT_FS_OPEN_FIXED, WT_STREAM_READ, &fs)); /* Optionally return the file handle. */ if (fsp == NULL) diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index 4339de6f25c..6c2e2f1b3fb 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -117,7 +117,7 @@ __wt_bt_read(WT_SESSION_IMPL *session, */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) { - fail_msg = "block decryption failed"; + fail_msg = "block decompression failed"; goto corrupt; } } else @@ -168,7 +168,8 @@ err: __wt_scr_free(session, &tmp); */ int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, - uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed) + uint8_t *addr, size_t *addr_sizep, + bool checkpoint, bool checkpoint_io, bool compressed) { WT_BM *bm; WT_BTREE *btree; @@ -359,10 +360,12 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, /* Call the block manager to write the block. */ WT_ERR(checkpoint ? 
bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) : - bm->write(bm, session, ip, addr, addr_sizep, data_cksum)); + bm->write( + bm, session, ip, addr, addr_sizep, data_cksum, checkpoint_io)); WT_STAT_FAST_CONN_INCR(session, cache_write); WT_STAT_FAST_DATA_INCR(session, cache_write); + S2C(session)->cache->bytes_written += dsk->mem_size; WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size); diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index fbe361e000a..1f080041a23 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -33,6 +33,7 @@ __ovfl_read(WT_SESSION_IMPL *session, store->data = WT_PAGE_HEADER_BYTE(btree, dsk); store->size = dsk->u.datalen; + WT_STAT_FAST_CONN_INCR(session, cache_read_overflow); WT_STAT_FAST_DATA_INCR(session, cache_read_overflow); return (0); @@ -208,6 +209,7 @@ __wt_ovfl_cache(WT_SESSION_IMPL *session, */ if (!visible) { WT_RET(__ovfl_cache(session, page, vpack)); + WT_STAT_FAST_CONN_INCR(session, cache_overflow_value); WT_STAT_FAST_DATA_INCR(session, cache_overflow_value); } diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 00ec8aa4494..89e5f428628 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -219,6 +219,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, /* Update the page's in-memory size and the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); + __wt_cache_page_image_incr(session, dsk->mem_size); /* Link the new internal page to the parent. 
*/ if (ref != NULL) { diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 086500c8b2f..3d396d5ae5b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -296,7 +296,7 @@ err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); * __evict_force_check -- * Check if a page matches the criteria for forced eviction. */ -static int +static bool __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; @@ -307,26 +307,26 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) - return (0); + return (false); /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ if (page->modify == NULL) - return (0); + return (false); /* Pages are usually small enough, check that first. */ if (page->memory_footprint < btree->splitmempage) - return (0); + return (false); else if (page->memory_footprint < btree->maxmempage) return (__wt_leaf_page_can_split(session, page)); /* Trigger eviction on the next page release. */ - __wt_page_evict_soon(page); + (void)__wt_page_evict_soon(session, ref); /* Bump the oldest ID, we're about to do some visibility checks. */ - WT_RET(__wt_txn_update_oldest(session, 0)); + (void)__wt_txn_update_oldest(session, 0); /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, ref, NULL)); @@ -548,10 +548,14 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. + * In-memory split of large pages is allowed while + * no_eviction is set on btree, whereas reconciliation + * is not allowed. 
*/ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - F_ISSET(btree, WT_BTREE_NO_EVICTION)) + (F_ISSET(btree, WT_BTREE_NO_EVICTION) && + !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) goto skip_evict; /* @@ -595,7 +599,14 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags page = ref->page; if (page->read_gen == WT_READGEN_NOTSET) { if (evict_soon) - __wt_page_evict_soon(page); + /* + * Ignore error returns, since the + * evict soon call is advisory and we + * are holding a hazard pointer to the + * page already. + */ + (void)__wt_page_evict_soon( + session, ref); else __wt_cache_read_gen_new(session, page); } else if (!LF_ISSET(WT_READ_NO_GEN)) diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 7a05a883f83..4f6f300802e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -298,7 +298,7 @@ static int __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp) { - WT_ADDR *addr; + WT_ADDR *addr, *ref_addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; @@ -345,13 +345,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * If there's no address (the page has never been written), or the - * address has been instantiated, there's no work to do. Otherwise, - * instantiate the address in-memory, from the on-page cell. + * If there's no address at all (the page has never been written), or + * the address has already been instantiated, there's no work to do. + * Otherwise, the address still references a split page on-page cell, + * instantiate it. We can race with reconciliation and/or eviction of + * the child pages, be cautious: read the address and verify it, and + * only update it if the value is unchanged from the original. 
In the + * case of a race, the address must no longer reference the split page, + * we're done. */ - addr = ref->addr; - if (addr != NULL && !__wt_off_page(from_home, addr)) { - __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); + WT_ORDERED_READ(ref_addr, ref->addr); + if (ref_addr != NULL && !__wt_off_page(from_home, ref_addr)) { + __wt_cell_unpack((WT_CELL *)ref_addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( session, unpack.data, unpack.size, &addr->addr)) != 0) { @@ -371,7 +376,10 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, break; WT_ILLEGAL_VALUE(session); } - ref->addr = addr; + if (!__wt_atomic_cas_ptr(&ref->addr, ref_addr, addr)) { + __wt_free(session, addr->addr); + __wt_free(session, addr); + } } /* And finally, copy the WT_REF pointer itself. */ @@ -786,7 +794,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ if (result_entries == 0) { empty_parent = true; - __wt_page_evict_soon(parent); + if (!__wt_ref_is_root(parent->pg_intl_parent_ref)) + ret = __wt_page_evict_soon( + session, parent->pg_intl_parent_ref); goto err; } @@ -1462,11 +1472,11 @@ err: if (parent != NULL) /* * __split_multi_inmem -- - * Instantiate a page in a multi-block set. + * Instantiate a page from a disk image. */ static int __split_multi_inmem( - WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref, WT_MULTI *multi) + WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *multi, WT_REF *ref) { WT_CURSOR_BTREE cbt; WT_DECL_ITEM(key); @@ -1487,13 +1497,12 @@ __split_multi_inmem( orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0); /* - * This code re-creates an in-memory page that is part of a set created - * while evicting a large page, and adds references to any unresolved - * update chains to the new page. We get here due to choosing to keep - * the results of a split in memory or because and update could not be - * written when attempting to evict a page. 
+ * This code re-creates an in-memory page from a disk image, and adds + * references to any unresolved update chains to the new page. We get + * here either because an update could not be written when evicting a + * page, or eviction chose to keep a page in memory. * - * Clear the disk image and link the page into the passed-in WT_REF to + * Steal the disk image and link the page into the passed-in WT_REF to * simplify error handling: our caller will not discard the disk image * when discarding the original page, and our caller will discard the * allocated page on error, when discarding the allocated WT_REF. @@ -1503,6 +1512,19 @@ __split_multi_inmem( WT_PAGE_DISK_ALLOC, &page)); multi->disk_image = NULL; + /* + * Put the re-instantiated page in the same LRU queue location as the + * original page, unless this was a forced eviction, in which case we + * leave the new page with the read generation unset. Eviction will + * set the read generation next time it visits this page. + */ + if (orig->read_gen != WT_READGEN_OLDEST) + page->read_gen = orig->read_gen; + + /* If there are no updates to apply to the page, we're done. */ + if (multi->supd_entries == 0) + return (0); + if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); @@ -1551,14 +1573,12 @@ __split_multi_inmem( } /* - * If we modified the page above, it will have set the first dirty - * transaction to the last transaction currently running. However, the - * updates we installed may be older than that. Set the first dirty - * transaction to an impossibly old value so this page is never skipped - * in a checkpoint. + * When modifying the page we set the first dirty transaction to the + * last transaction currently running. However, the updates we made + * might be older than that. Set the first dirty transaction to an + * impossibly old value so this page is never skipped in a checkpoint. 
*/ - if (page->modify != NULL) - page->modify->first_dirty_txn = WT_TXN_FIRST; + page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ WT_TRET(__wt_btcur_close(&cbt, true)); @@ -1629,19 +1649,17 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) */ int __wt_multi_to_ref(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp) + WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) { WT_ADDR *addr; WT_IKEY *ikey; WT_REF *ref; - size_t incr; - - incr = 0; /* Allocate an underlying WT_REF. */ WT_RET(__wt_calloc_one(session, refp)); ref = *refp; - incr += sizeof(WT_REF); + if (incrp) + *incrp += sizeof(WT_REF); /* * Set the WT_REF key before (optionally) building the page, underlying @@ -1653,21 +1671,34 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, ikey = multi->key.ikey; WT_RET(__wt_row_ikey( session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); - incr += sizeof(WT_IKEY) + ikey->size; + if (incrp) + *incrp += sizeof(WT_IKEY) + ikey->size; break; default: ref->ref_recno = multi->key.recno; break; } - /* If there's a disk image, build a page, otherwise set the address. */ - if (multi->disk_image == NULL) { - /* - * Copy the address: we could simply take the buffer, but that - * would complicate error handling, freeing the reference array - * would have to avoid freeing the memory, and it's not worth - * the confusion. - */ + /* There should be an address or a disk image (or both). */ + WT_ASSERT(session, + multi->addr.addr != NULL || multi->disk_image != NULL); + + /* If we're closing the file, there better be an address. */ + WT_ASSERT(session, multi->addr.addr != NULL || !closing); + + /* Verify any disk image we have. 
*/ + WT_ASSERT(session, multi->disk_image == NULL || + __wt_verify_dsk_image(session, + "[page instantiate]", multi->disk_image, 0, false) == 0); + + /* + * If there's an address, the page was written, set it. + * + * Copy the address: we could simply take the buffer, but that would + * complicate error handling, freeing the reference array would have + * to avoid freeing the memory, and it's not worth the confusion. + */ + if (multi->addr.addr != NULL) { WT_RET(__wt_calloc_one(session, &addr)); ref->addr = addr; addr->size = multi->addr.size; @@ -1675,14 +1706,20 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_RET(__wt_strndup(session, multi->addr.addr, addr->size, &addr->addr)); ref->state = WT_REF_DISK; - } else { - WT_RET(__split_multi_inmem(session, page, ref, multi)); + } + + /* + * If we have a disk image and we're not closing the file, + * re-instantiate the page. + * + * Discard any page image we don't use. + */ + if (multi->disk_image != NULL && !closing) { + WT_RET(__split_multi_inmem(session, page, multi, ref)); ref->state = WT_REF_MEM; } + __wt_free(session, multi->disk_image); - /* Optionally return changes in the memory footprint. */ - if (incrp != NULL) - *incrp += incr; return (0); } @@ -2086,8 +2123,8 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) */ WT_RET(__wt_calloc_def(session, new_entries, &ref_new)); for (i = 0; i < new_entries; ++i) - WT_ERR(__wt_multi_to_ref(session, - page, &mod->mod_multi[i], &ref_new[i], &parent_incr)); + WT_ERR(__wt_multi_to_ref(session, page, + &mod->mod_multi[i], &ref_new[i], &parent_incr, closing)); /* * Split into the parent; if we're closing the file, we hold it @@ -2175,15 +2212,13 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) * Rewrite an in-memory page with a new version. 
*/ int -__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) { WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; WT_REF *new; page = ref->page; - mod = page->modify; WT_RET(__wt_verbose( session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page)); @@ -2198,14 +2233,14 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * * Build the new page. * - * Allocate a WT_REF because the error path uses routines that will ea - * free memory. The only field we need to set is the record number, as - * it's used by the search routines. + * Allocate a WT_REF, the error path calls routines that free memory. + * The only field we need to set is the record number, as it's used by + * the search routines. */ WT_RET(__wt_calloc_one(session, &new)); new->ref_recno = ref->ref_recno; - WT_ERR(__split_multi_inmem(session, page, new, &mod->mod_multi[0])); + WT_ERR(__split_multi_inmem(session, page, multi, new)); /* * The rewrite succeeded, we can no longer fail. @@ -2213,7 +2248,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * Finalize the move, discarding moved update lists from the original * page. */ - __split_multi_inmem_final(page, &mod->mod_multi[0]); + __split_multi_inmem_final(page, multi); /* * Discard the original page. diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index 3d5abf34147..d3ddf33446e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -41,6 +41,9 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); + WT_STAT_SET(session, stats, cache_bytes_inuse, + __wt_btree_bytes_inuse(session)); + /* Everything else is really, really expensive. 
*/ if (!F_ISSET(cst, WT_CONN_STAT_ALL)) return (0); @@ -139,7 +142,7 @@ __stat_page_col_var( } else { orig_deleted = false; __wt_cell_unpack(cell, unpack); - if (unpack->type == WT_CELL_ADDR_DEL) + if (unpack->type == WT_CELL_DEL) orig_deleted = true; else { entry_cnt += __wt_cell_rle(unpack); diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index da6c53aa316..df794c96cda 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -84,7 +84,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_ERR(__wt_txn_get_snapshot(session)); leaf_bytes += page->memory_footprint; ++leaf_pages; - WT_ERR(__wt_reconcile(session, walk, NULL, 0)); + WT_ERR(__wt_reconcile( + session, walk, NULL, WT_CHECKPOINTING)); } } break; @@ -92,7 +93,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * If we are flushing a file at read-committed isolation, which * is of particular interest for flushing the metadata to make - * schema-changing operation durable, get a transactional + * a schema-changing operation durable, get a transactional * snapshot now. * * All changes committed up to this point should be included. @@ -126,7 +127,17 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); - WT_ERR(__wt_evict_file_exclusive_on(session)); + /* + * Sync for checkpoint allows splits to happen while the queue + * is being drained, but not reconciliation. We need to do this, + * since draining the queue can take long enough for hot pages + * to grow significantly larger than the configured maximum + * size. 
+ */ + F_SET(btree, WT_BTREE_NO_RECONCILE); + ret = __wt_evict_file_exclusive_on(session); + F_CLR(btree, WT_BTREE_NO_RECONCILE); + WT_ERR(ret); __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); @@ -183,7 +194,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) leaf_bytes += page->memory_footprint; ++leaf_pages; } - WT_ERR(__wt_reconcile(session, walk, NULL, 0)); + WT_ERR(__wt_reconcile( + session, walk, NULL, WT_CHECKPOINTING)); } break; case WT_SYNC_CLOSE: @@ -217,41 +229,9 @@ err: /* On error, clear any left-over tree walk. */ saved_snap_min == WT_TXN_NONE) __wt_txn_release_snapshot(session); - if (btree->checkpointing != WT_CKPT_OFF) { - /* - * Update the checkpoint generation for this handle so visible - * updates newer than the checkpoint can be evicted. - * - * This has to be published before eviction is enabled again, - * so that eviction knows that the checkpoint has completed. - */ - WT_PUBLISH(btree->checkpoint_gen, - conn->txn_global.checkpoint_gen); - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); - - /* - * Clear the checkpoint flag and push the change; not required, - * but publishing the change means stalled eviction gets moving - * as soon as possible. - */ - btree->checkpointing = WT_CKPT_OFF; - WT_FULL_BARRIER(); - - /* - * If this tree was being skipped by the eviction server during - * the checkpoint, clear the wait. - */ - btree->evict_walk_period = 0; - - /* - * Wake the eviction server, in case application threads have - * stalled while the eviction server decided it couldn't make - * progress. Without this, application threads will be stalled - * until the eviction server next wakes. - */ - WT_TRET(__wt_evict_server_wake(session)); - } + /* Clear the checkpoint flag and push the change. 
*/ + if (btree->checkpointing != WT_CKPT_OFF) + WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF); __wt_spin_unlock(session, &btree->flush_lock); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index bb8a750d848..17d32d6ed63 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -381,16 +381,6 @@ restart: /* __ref_ascend(session, &ref, &pindex, &slot); /* - * If we got all the way through an internal page and - * all of the child pages were deleted, mark it for - * eviction. - */ - if (empty_internal && pindex->entries > 1) { - __wt_page_evict_soon(ref->page); - empty_internal = false; - } - - /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. @@ -404,6 +394,16 @@ restart: /* } /* + * If we got all the way through an internal page and + * all of the child pages were deleted, mark it for + * eviction. + */ + if (empty_internal && pindex->entries > 1) { + WT_ERR(__wt_page_evict_soon(session, ref)); + empty_internal = false; + } + + /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 4afcd74520f..0f70e84de7e 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -775,7 +775,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * traversing the skip list each time accumulates to real time. */ if (samples > 5000) - __wt_page_evict_soon(page); + WT_RET(__wt_page_evict_soon(session, cbt->ref)); return (0); } |