diff options
22 files changed, 202 insertions, 220 deletions
diff --git a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c index 8d50cc7ec5d..a13f7a79b1f 100644 --- a/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c +++ b/src/third_party/wiredtiger/ext/extractors/csv/csv_extractor.c @@ -97,8 +97,10 @@ csv_extract(WT_EXTRACTOR *extractor, WT_SESSION *session, strncpy(copy, p, len); copy[len] = '\0'; if (csv_extractor->format_isnum) { - if ((val = atoi(copy)) < 0) + if ((val = atoi(copy)) < 0) { + free(copy); return (EINVAL); + } result_cursor->set_key(result_cursor, val); } else result_cursor->set_key(result_cursor, copy); @@ -150,7 +152,7 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session, return (errno); *csv_extractor = *orig; - csv_extractor->field = field_num; + csv_extractor->field = (int)field_num; csv_extractor->format_isnum = (format.str[0] == 'i'); *customp = (WT_EXTRACTOR *)csv_extractor; return (0); diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index 910df616015..9dd72108e4b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -77,7 +77,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); - ret = __wt_evict(session, ref, 0); + ret = __wt_evict(session, ref, false); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } @@ -99,25 +99,18 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that - * out is to check the on-page cell type for the page, cells for leaf - * pages that have no overflow items are special. - * - * In some cases, the reference address may not reference an on-page - * cell (for example, some combination of page splits), in which case - * we can't check the original cell value and we fail. + * out is to check the page's cell type, cells for leaf pages without + * overflow items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. - * - * !!! - * I doubt it's worth the effort, but we could copy the cell's type into - * the reference structure, and then we wouldn't need an on-page cell. */ parent = ref->home; - if (__wt_off_page(parent, ref->addr) || + if (__wt_off_page(parent, ref->addr) ? + ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 32418a9c063..7cd97831044 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -50,15 +50,18 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) page = *pagep; *pagep = NULL; + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); + /* - * We should never discard ... + * We should never discard: + * - a dirty page, + * - a page queued for eviction, or + * - a locked page. */ - WT_ASSERT( /* ... a dirty page */ - session, !__wt_page_is_modified(page)); - WT_ASSERT( /* ... a page queued for LRU eviction */ - session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT( /* ... a locked page */ - session, !__wt_fair_islocked(session, &page->page_lock)); + WT_ASSERT(session, !__wt_page_is_modified(page)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); + WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock)); #ifdef HAVE_DIAGNOSTIC { @@ -227,7 +230,7 @@ __free_page_int(WT_SESSION_IMPL *session, WT_PAGE *page) */ void __wt_free_ref( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages) + WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages) { WT_IKEY *ikey; @@ -246,8 +249,15 @@ __wt_free_ref( __wt_page_out(session, &ref->page); } - /* Free any key allocation. */ - switch (page->type) { + /* + * Optionally free row-store WT_REF key allocation. Historic versions of + * this code looked in a passed-in page argument, but that is dangerous, + * some of our error-path callers create WT_REF structures without ever + * setting WT_REF.home or having a parent page to which the WT_REF will + * be linked. Those WT_REF structures invariably have instantiated keys, + * (they obviously cannot be on-page keys), and we must free the memory. + */ + switch (page_type) { case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: if ((ikey = __wt_ref_key_instantiated(ref)) != NULL) @@ -255,8 +265,12 @@ __wt_free_ref( break; } - /* Free any address allocation. */ - if (ref->addr != NULL && __wt_off_page(page, ref->addr)) { + /* + * Free any address allocation; if there's no linked WT_REF page, it + * must be allocated. + */ + if (ref->addr != NULL && + (ref->home == NULL || __wt_off_page(ref->home, ref->addr))) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); __wt_free(session, ref->addr); } @@ -272,7 +286,7 @@ __wt_free_ref( /* * __wt_free_ref_index -- - * Discard a page index and it's references. + * Discard a page index and its references. */ void __wt_free_ref_index(WT_SESSION_IMPL *session, @@ -284,7 +298,8 @@ __wt_free_ref_index(WT_SESSION_IMPL *session, return; for (i = 0; i < pindex->entries; ++i) - __wt_free_ref(session, page, pindex->index[i], free_pages); + __wt_free_ref( + session, pindex->index[i], page->type, free_pages); __wt_free(session, pindex); } diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index dbdf94fc1b6..294cc399d65 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -697,6 +697,13 @@ __btree_page_sizes(WT_SESSION_IMPL *session) } /* + * Try in-memory splits once we hit 80% of the maximum in-memory page + * size. This gives multi-threaded append workloads a better chance of + * not stalling. + */ + btree->splitmempage = 8 * btree->maxmempage / 10; + + /* * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 18fd87e78ff..77215474359 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -307,10 +307,6 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) btree = S2BT(session); page = ref->page; - /* Pages are usually small enough, check that first. */ - if (page->memory_footprint < btree->maxmempage) - return (0); - /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) return (0); @@ -322,6 +318,12 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) if (page->modify == NULL) return (0); + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->splitmempage) + return (0); + else if (page->memory_footprint < btree->maxmempage) + return (__wt_leaf_page_can_split(session, page)); + /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); @@ -329,7 +331,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) __wt_txn_update_oldest(session, false); /* If eviction cannot succeed, don't try. */ - return (__wt_page_can_evict(session, ref, true, NULL)); + return (__wt_page_can_evict(session, ref, NULL)); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 4f7fdc97f5f..80e467b5707 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -326,7 +326,7 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) */ if (ss->root_ref.page != NULL) { btree->ckpt = ckptbase; - ret = __wt_evict(session, &ss->root_ref, 1); + ret = __wt_evict(session, &ss->root_ref, true); ss->root_ref.page = NULL; btree->ckpt = NULL; } @@ -1304,7 +1304,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, 1); + ret = __wt_evict(session, ref, true); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); @@ -2030,7 +2030,7 @@ __slvg_row_build_leaf( */ ret = __wt_page_release(session, ref, 0); if (ret == 0) - ret = __wt_evict(session, ref, 1); + ret = __wt_evict(session, ref, true); if (0) { err: WT_TRET(__wt_page_release(session, ref, 0)); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 92ed2b3e559..a0dfbf32cad 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -340,8 +340,18 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, return (ret); } addr->size = (uint8_t)unpack.size; - addr->type = - unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF; + switch (unpack.raw) { + case WT_CELL_ADDR_INT: + addr->type = WT_ADDR_INT; + break; + case WT_CELL_ADDR_LEAF: + addr->type = WT_ADDR_LEAF; + break; + case WT_CELL_ADDR_LEAF_NO: + addr->type = WT_ADDR_LEAF_NO; + break; + WT_ILLEGAL_VALUE(session); + } ref->addr = addr; } @@ -399,18 +409,9 @@ __split_ref_move_final( WT_DECL_RET; WT_PAGE *child; WT_REF *ref, *child_ref; - uint64_t txn_new_id; uint32_t i; /* - * When creating new internal pages as part of a split, we set a field - * in those pages modify structure to prevent them from being evicted - * until all threads are known to have exited the index of the page that - * previously "owned" the WT_REF. Set that field to a safe value. - */ - txn_new_id = __wt_txn_id_alloc(session, false); - - /* * The WT_REF structures moved to newly allocated child pages reference * the wrong parent page and we have to fix that up. The problem is * revealed when a thread of control searches for the child page's @@ -461,8 +462,6 @@ __split_ref_move_final( if (child_ref->home != child) { child_ref->home = child; child_ref->pindex_hint = 0; - - child->modify->mod_split_txn = txn_new_id; } } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); @@ -1527,7 +1526,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi) * Discard allocated pages after failure. */ static void -__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_REF *ref) +__split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) { /* * We failed creating new in-memory pages. For error-handling reasons, @@ -1537,7 +1536,7 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_REF *ref) */ if (ref->page != NULL) { F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE); - __wt_free_ref(session, ref->page, ref, true); + __wt_free_ref(session, ref, orig->type, true); } } @@ -1635,7 +1634,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Note this page has already been through an in-memory split. */ - WT_ASSERT(session, __wt_page_can_split(session, page)); + WT_ASSERT(session, __wt_leaf_page_can_split(session, page)); WT_ASSERT(session, __wt_page_is_modified(page)); F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); @@ -1818,13 +1817,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* - * Save the transaction ID when the split happened. Application - * threads will not try to forcibly evict the page again until - * all concurrent transactions commit. - */ - page->modify->inmem_split_txn = __wt_txn_id_alloc(session, false); - - /* * Update the page accounting. * * XXX @@ -1962,7 +1954,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) if (0) { err: for (i = 0; i < new_entries; ++i) - __split_multi_inmem_fail(session, ref_new[i]); + __split_multi_inmem_fail(session, page, ref_new[i]); } __wt_free(session, ref_new); @@ -2072,6 +2064,6 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) return (0); -err: __split_multi_inmem_fail(session, &new); +err: __split_multi_inmem_fail(session, page, &new); return (ret); } diff --git a/src/third_party/wiredtiger/src/cursor/cur_file.c b/src/third_party/wiredtiger/src/cursor/cur_file.c index 7c18b59fded..63f77248ca8 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_file.c +++ b/src/third_party/wiredtiger/src/cursor/cur_file.c @@ -379,7 +379,7 @@ __curfile_close(WT_CURSOR *cursor) * updated correctly. */ if (session->dhandle != NULL) { - /* Increment the data-source's in-use counter. */ + /* Decrement the data-source's in-use counter. */ __wt_cursor_dhandle_decr_use(session); WT_TRET(__wt_session_release_btree(session)); } @@ -439,6 +439,9 @@ __wt_curfile_create(WT_SESSION_IMPL *session, cursor->value_format = btree->value_format; cbt->btree = btree; + if (session->dhandle->checkpoint != NULL) + F_SET(cbt, WT_CBT_NO_TXN); + if (bulk) { F_SET(cursor, WT_CURSTD_BULK); diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 043fbf6bbeb..2b2117ad9fd 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -76,22 +76,16 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * Evict the page. */ - WT_ERR(__wt_evict(session, ref, 1)); + WT_ERR(__wt_evict(session, ref, true)); break; case WT_SYNC_DISCARD: /* - * Dead handles may reference dirty pages; clean the - * page, both to keep statistics correct, and to let - * the page-discard function assert no dirty page is - * ever discarded. + * Discard the page regardless of whether it is dirty. */ - if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) - __wt_page_modify_clear(session, page); - WT_ASSERT(session, F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || - __wt_page_can_evict(session, ref, false, NULL)); - __wt_evict_page_clean_update(session, ref, 1); + __wt_page_can_evict(session, ref, NULL)); + __wt_evict_page_clean_update(session, ref, true); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 02e8bd24899..306362de57f 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -1169,7 +1169,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) uint64_t pages_walked; uint32_t walk_flags; int internal_pages, restarts; - bool enough, modified; + bool enough, modified, would_split; conn = S2C(session); btree = S2BT(session); @@ -1254,10 +1254,16 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) page->read_gen = __wt_cache_read_gen_new(session); fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, ref, true, NULL)) + if (!__wt_page_can_evict(session, ref, &would_split)) continue; /* + * Note: take care with ordering: if we detected that + * the page is modified above, we expect mod != NULL. + */ + mod = page->modify; + + /* * Additional tests if eviction is likely to succeed. * * If eviction is stuck or we are helping with forced eviction, @@ -1270,33 +1276,27 @@ fast: /* If the page can't be evicted, give up. */ if (!FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { /* - * Note: take care with ordering: if we detected that - * the page is modified above, we expect mod != NULL. - */ - mod = page->modify; - - /* * If the page is clean but has modifications that * appear too new to evict, skip it. */ if (!modified && mod != NULL && !__wt_txn_visible_all(session, mod->rec_max_txn)) continue; - - /* - * If the oldest transaction hasn't changed since the - * last time this page was written, it's unlikely we - * can make progress. Similarly, if the most recent - * update on the page is not yet globally visible, - * eviction will fail. These heuristics attempt to - * avoid repeated attempts to evict the same page. - */ - if (modified && - (mod->disk_snap_min == conn->txn_global.oldest_id || - !__wt_txn_visible_all(session, mod->update_txn))) - continue; } + /* + * If the oldest transaction hasn't changed since the last time + * this page was written, it's unlikely we can make progress. + * Similarly, if the most recent update on the page is not yet + * globally visible, eviction will fail. These heuristics + * attempt to avoid repeated attempts to evict the same page. + */ + if (modified && !would_split && + !FLD_ISSET(cache->state, WT_CACHE_STUCK) && + (mod->last_oldest_id == __wt_txn_oldest_id(session) || + !__wt_txn_visible_all(session, mod->update_txn))) + continue; + WT_ASSERT(session, evict->ref == NULL); __evict_init_candidate(session, evict, ref); ++evict; @@ -1454,17 +1454,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) if (page->read_gen != WT_READGEN_OLDEST) page->read_gen = __wt_cache_read_gen_bump(session); - /* - * If we are evicting in a dead tree, don't write dirty pages. - * - * Force pages clean to keep statistics correct and to let the - * page-discard function assert that no dirty pages are ever - * discarded. - */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) - __wt_page_modify_clear(session, page); - - WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0)); + WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false)); (void)__wt_atomic_subv32(&btree->evict_busy, 1); diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 2e5f82d1ff8..26ea9117fae 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -55,7 +55,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; - bool forced_eviction, inmem_split; + bool clean_page, forced_eviction, inmem_split, tree_dead; conn = S2C(session); @@ -65,6 +65,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) page = ref->page; forced_eviction = page->read_gen == WT_READGEN_OLDEST; inmem_split = false; + tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD); WT_RET(__wt_verbose(session, WT_VERB_EVICT, "page %p (%s)", page, __wt_page_type_string(page->type))); @@ -105,24 +106,26 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) if (page->memory_footprint > conn->cache->evict_max_page_size) conn->cache->evict_max_page_size = page->memory_footprint; + /* Figure out whether reconciliation was done on the page */ + clean_page = mod == NULL || mod->rec_result == 0; + /* Update the reference and discard the page. */ - if ((mod == NULL || mod->rec_result == 0) && - !F_ISSET(conn, WT_CONN_IN_MEMORY)) { - if (__wt_ref_is_root(ref)) - __wt_ref_out(session, ref); - else - WT_ERR(__wt_evict_page_clean_update( - session, ref, closing)); + if (__wt_ref_is_root(ref)) + __wt_ref_out(session, ref); + else if (tree_dead || (clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY))) + /* + * Pages that belong to dead trees never write back to disk + * and can't support page splits. + */ + WT_ERR(__wt_evict_page_clean_update( + session, ref, tree_dead || closing)); + else + WT_ERR(__evict_page_dirty_update(session, ref, closing)); + if (clean_page) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean); WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean); } else { - if (__wt_ref_is_root(ref)) - __wt_ref_out(session, ref); - else - WT_ERR(__evict_page_dirty_update( - session, ref, closing)); - WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty); WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty); } @@ -400,6 +403,13 @@ __evict_review( } /* + * It is always OK to evict pages from dead trees if they don't have + * children. + */ + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + return (0); + + /* * Retrieve the modified state of the page. This must happen after the * check for evictable internal pages otherwise there is a race where a * page could be marked modified due to a child being transitioned to @@ -424,7 +434,7 @@ __evict_review( if (modified) __wt_txn_update_oldest(session, true); - if (!__wt_page_can_evict(session, ref, false, inmem_splitp)) + if (!__wt_page_can_evict(session, ref, inmem_splitp)) return (EBUSY); /* diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index ae29dc68003..6ee74c61a38 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -198,20 +198,9 @@ struct __wt_ovfl_txnc { * When a page is modified, there's additional information to maintain. */ struct __wt_page_modify { - /* - * Track the highest transaction ID at which the page was written to - * disk. This can be used to avoid trying to write the page multiple - * times if a snapshot is keeping old versions pinned (e.g., in a - * checkpoint). - */ - uint64_t disk_snap_min; - /* The first unwritten transaction ID (approximate). */ uint64_t first_dirty_txn; - /* In-memory split transaction ID. */ - uint64_t inmem_split_txn; - /* Avoid checking for obsolete updates during checkpoints. */ uint64_t obsolete_check_txn; @@ -221,10 +210,8 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; -#ifdef HAVE_DIAGNOSTIC /* Check that transaction time moves forward. */ uint64_t last_oldest_id; -#endif /* Dirty bytes added to the cache. */ size_t bytes_dirty; @@ -313,17 +300,8 @@ struct __wt_page_modify { * so they can be discarded when no longer needed. */ WT_PAGE *root_split; /* Linked list of root split pages */ - - /* - * When we deepen the tree, newly created internal pages cannot - * be evicted until all threads have exited the original page - * index structure. We set a transaction value during the split - * that's checked during eviction. - */ - uint64_t split_txn; /* Split eviction transaction value */ } intl; #define mod_root_split u2.intl.root_split -#define mod_split_txn u2.intl.split_txn struct { /* * Appended items to column-stores: there is only a single one diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index ccdcccbaa0e..a1d8e395cfc 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -88,7 +88,8 @@ struct __wt_btree { uint32_t maxleafpage; /* Leaf page max size */ uint32_t maxleafkey; /* Leaf page max key size */ uint32_t maxleafvalue; /* Leaf page max value size */ - uint64_t maxmempage; /* In memory page max size */ + uint64_t maxmempage; /* In-memory page max size */ + uint64_t splitmempage; /* In-memory split trigger size */ void *huffman_key; /* Key huffman encoding */ void *huffman_value; /* Value huffman encoding */ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index a92d52e784a..3f8dc08a1da 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -330,6 +330,8 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) { uint64_t last_running; + WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD)); + last_running = 0; if (page->modify->write_gen == 0) last_running = S2C(session)->txn_global.last_running; @@ -347,13 +349,6 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_cache_dirty_incr(session, page); /* - * The page can never end up with changes older than the oldest - * running transaction. - */ - if (F_ISSET(&session->txn, WT_TXN_HAS_SNAPSHOT)) - page->modify->disk_snap_min = session->txn.snap_min; - - /* * We won the race to dirty the page, but another thread could * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not @@ -968,11 +963,11 @@ __wt_ref_info(WT_SESSION_IMPL *session, } /* - * __wt_page_can_split -- + * __wt_leaf_page_can_split -- * Check whether a page can be split in memory. */ static inline bool -__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT_HEAD *ins_head; @@ -1003,7 +998,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * reconciliation will be wrong, so we can't evict immediately). */ if (page->type != WT_PAGE_ROW_LEAF || - page->memory_footprint < btree->maxmempage || + page->memory_footprint < btree->splitmempage || !__wt_page_is_modified(page)) return (false); @@ -1046,13 +1041,11 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * Check whether a page can be evicted. */ static inline bool -__wt_page_can_evict(WT_SESSION_IMPL *session, - WT_REF *ref, bool check_splits, bool *inmem_splitp) +__wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) { WT_BTREE *btree; WT_PAGE *page; WT_PAGE_MODIFY *mod; - WT_TXN_GLOBAL *txn_global; if (inmem_splitp != NULL) *inmem_splitp = false; @@ -1071,7 +1064,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * detailed eviction tests. We don't need further tests since the page * won't be written or discarded from the cache. */ - if (__wt_page_can_split(session, page)) { + if (__wt_leaf_page_can_split(session, page)) { if (inmem_splitp != NULL) *inmem_splitp = true; return (true); @@ -1105,29 +1098,12 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * pages cannot be evicted until all threads are known to have exited * the original parent page's index, because evicting an internal page * discards its WT_REF array, and a thread traversing the original - * parent page index might see a freed WT_REF. During the split we set - * a transaction value, we can evict the created page as soon as that - * transaction value is globally visible. + * parent page index might see a freed WT_REF. */ - if (check_splits && WT_PAGE_IS_INTERNAL(page) && - (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) || - !__wt_txn_visible_all(session, mod->mod_split_txn))) + if (WT_PAGE_IS_INTERNAL(page) && + F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) return (false); - /* - * If the page was recently split in-memory, don't evict it immediately: - * we want to give application threads that are appending a chance to - * move to the new leaf page created by the split. - * - * Note the check here is similar to __wt_txn_visible_all, but ignores - * the checkpoint's transaction. - */ - if (check_splits) { - txn_global = &S2C(session)->txn_global; - if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) - return (false); - } - return (true); } @@ -1162,7 +1138,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) (void)__wt_atomic_addv32(&btree->evict_busy, 1); too_big = page->memory_footprint > btree->maxmempage; - if ((ret = __wt_evict(session, ref, 0)) == 0) { + if ((ret = __wt_evict(session, ref, false)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -1221,7 +1197,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || - !__wt_page_can_evict(session, ref, true, NULL)) + !__wt_page_can_evict(session, ref, NULL)) return (__wt_hazard_clear(session, page)); WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 76d79d17b2a..54787d2227b 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -197,7 +197,14 @@ struct __wt_cursor_btree { #define WT_CBT_ITERATE_NEXT 0x04 /* Next iteration configuration */ #define WT_CBT_ITERATE_PREV 0x08 /* Prev iteration configuration */ #define WT_CBT_MAX_RECORD 0x10 /* Col-store: past end-of-table */ -#define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ +#define WT_CBT_NO_TXN 0x20 /* Non-transactional cursor + (e.g. on a checkpoint) */ +#define WT_CBT_SEARCH_SMALLEST 0x40 /* Row-store: small-key insert list */ + +#define WT_CBT_POSITION_MASK /* Flags associated with position */ \ + (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ + WT_CBT_MAX_RECORD | WT_CBT_SEARCH_SMALLEST) + uint8_t flags; }; @@ -302,7 +309,7 @@ struct __wt_cursor_join_entry { WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */ size_t ends_allocated; - size_t ends_next; + u_int ends_next; WT_JOIN_STATS stats; /* Join statistics */ }; diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index 9dd280534b4..2e382591313 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -41,11 +41,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt) cbt->cip_saved = NULL; cbt->rip_saved = NULL; - /* - * Don't clear the active flag, it's owned by the cursor enter/leave - * functions. - */ - F_CLR(cbt, ~WT_CBT_ACTIVE); + F_CLR(cbt, WT_CBT_POSITION_MASK); } /* @@ -93,7 +89,8 @@ __curfile_enter(WT_CURSOR_BTREE *cbt) session = (WT_SESSION_IMPL *)cbt->iface.session; - WT_RET(__cursor_enter(session)); + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + WT_RET(__cursor_enter(session)); F_SET(cbt, WT_CBT_ACTIVE); return (0); } @@ -112,7 +109,8 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) /* If the cursor was active, deactivate it. */ if (F_ISSET(cbt, WT_CBT_ACTIVE)) { - __cursor_leave(session); + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + __cursor_leave(session); F_CLR(cbt, WT_CBT_ACTIVE); } @@ -204,7 +202,7 @@ err: return (ret); /* * __wt_cursor_dhandle_incr_use -- - * Increment the in-use counter in cursor's data source. + * Increment the in-use counter in the cursor's data source. */ static inline void __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session) @@ -221,7 +219,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session) /* * __wt_cursor_dhandle_decr_use -- - * Decrement the in-use counter in cursor's data source. + * Decrement the in-use counter in the cursor's data source. */ static inline void __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) @@ -262,7 +260,13 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) if (!F_ISSET(cbt, WT_CBT_ACTIVE)) WT_RET(__curfile_enter(cbt)); - __wt_txn_cursor_op(session); + + /* + * If this is an ordinary transactional cursor, make sure we are set up + * to read. + */ + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + __wt_txn_cursor_op(session); return (0); } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 3cbbb7b0072..af8a7aa70e9 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -122,7 +122,7 @@ extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool vi extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); -extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages); +extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages); extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, bool free_pages); extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index b199252a1dc..936164fa9a7 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -70,7 +70,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { }; struct __wt_txn_global { - uint64_t alloc; /* Transaction ID to allocate. */ + WT_SPINLOCK id_lock; volatile uint64_t current; /* Current transaction ID. */ /* The oldest running transaction ID (may race). */ diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index ef9d5a273cf..1005d4a395d 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -323,7 +323,6 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) { WT_TXN_GLOBAL *txn_global; uint64_t id; - u_int i; txn_global = &S2C(session)->txn_global; @@ -350,20 +349,16 @@ __wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish) * global current ID, so we want post-increment semantics. Our atomic * add primitive does pre-increment, so adjust the result here. */ - id = __wt_atomic_addv64(&S2C(session)->txn_global.alloc, 1) - 1; + __wt_spin_lock(session, &txn_global->id_lock); + id = txn_global->current; if (publish) { session->txn.id = id; - WT_SESSION_TXN_STATE(session)->id = id; + WT_PUBLISH(WT_SESSION_TXN_STATE(session)->id, id); } - for (i = 0; txn_global->current != id; i++) - if (i < 100) - WT_PAUSE(); - else - __wt_yield(); - - WT_PUBLISH(txn_global->current, id + 1); + ++txn_global->current; + __wt_spin_unlock(session, &txn_global->id_lock); return (id); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 5cd8b10c06f..8bf81eafac2 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -351,6 +351,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; + uint64_t oldest_id; page = ref->page; mod = page->modify; @@ -361,21 +362,14 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* We shouldn't get called with a clean page, that's an error. */ WT_ASSERT(session, __wt_page_is_modified(page)); -#ifdef HAVE_DIAGNOSTIC - { /* * Check that transaction time always moves forward for a given page. * If this check fails, reconciliation can free something that a future * reconciliation will need. */ - uint64_t oldest_id = __wt_txn_oldest_id(session); + oldest_id = __wt_txn_oldest_id(session); WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id)); mod->last_oldest_id = oldest_id; - } -#endif - - /* Record the most recent transaction ID we will *not* write. */ - mod->disk_snap_min = session->txn.snap_min; /* Initialize the reconciliation structure for each new run. */ WT_RET(__rec_write_init( @@ -2944,8 +2938,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) break; case SPLIT_TRACKING_RAW: /* - * We were configured for raw compression, but never actually - * wrote anything. + * We were configured for raw compression, and either we never + * wrote anything, or there's a remaindered block of data. */ break; WT_ILLEGAL_VALUE(session); @@ -2998,14 +2992,27 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) static int __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - /* We're done reconciling - write the final page */ + WT_BTREE *btree; + size_t data_size; + + btree = S2BT(session); + + /* + * We're done reconciling, write the final page. Call raw compression + * until/unless there's not enough data to compress. + */ if (r->raw_compression && r->entries != 0) { - while (r->entries != 0) + while (r->entries != 0) { + data_size = + WT_PTRDIFF32(r->first_free, r->disk_image.mem); + if (data_size <= btree->allocsize) + break; WT_RET(__rec_split_raw_worker(session, r, 0, true)); - } else - WT_RET(__rec_split_finish_std(session, r)); - - return (0); + } + if (r->entries == 0) + return (0); + } + return (__rec_split_finish_std(session, r)); } /* diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index 12f7ce2ec3f..053f69ee7f8 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -206,6 +206,9 @@ __session_close(WT_SESSION *wt_session, const char *config) __wt_spin_unlock(session, &conn->api_lock); + /* We no longer have a session, don't try to update it. */ + session = NULL; + err: API_END_RET_NOTFOUND_MAP(session, ret); } diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 850e7e83803..f835fea8f67 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -712,9 +712,11 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); txn_global = &conn->txn_global; - txn_global->alloc = txn_global->current = - txn_global->last_running = txn_global->oldest_id = WT_TXN_FIRST; + txn_global->current = txn_global->last_running = + txn_global->oldest_id = WT_TXN_FIRST; + WT_RET(__wt_spin_init(session, + &txn_global->id_lock, "transaction id lock")); WT_RET(__wt_rwlock_alloc(session, &txn_global->nsnap_rwlock, "named snapshot lock")); txn_global->nsnap_oldest_id = WT_TXN_NONE; @@ -747,6 +749,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) if (txn_global == NULL) return (0); + __wt_spin_destroy(session, &txn_global->id_lock); WT_TRET(__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock)); __wt_free(session, txn_global->states); |