diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/include/btree.i')
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.i | 216 |
1 files changed, 126 insertions, 90 deletions
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 058a00d5a78..b54cecb6ce0 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -10,17 +10,17 @@ * __wt_ref_is_root -- * Return if the page reference is for the root page. */ -static inline int +static inline bool __wt_ref_is_root(WT_REF *ref) { - return (ref->home == NULL ? 1 : 0); + return (ref->home == NULL); } /* * __wt_page_is_empty -- * Return if the page is empty. */ -static inline int +static inline bool __wt_page_is_empty(WT_PAGE *page) { return (page->modify != NULL && @@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page) * __wt_page_is_modified -- * Return if the page is dirty. */ -static inline int +static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0); + return (page->modify != NULL && page->modify->write_gen != 0); } /* @@ -84,6 +84,9 @@ __wt_cache_decr_check_size( __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); first = 0; } +#else + WT_UNUSED(fld); + WT_UNUSED(session); #endif } @@ -109,6 +112,9 @@ __wt_cache_decr_check_uint64( __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); first = 0; } +#else + WT_UNUSED(fld); + WT_UNUSED(session); #endif } @@ -352,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not * impossible, so we take care to read the global state before - * the atomic increment. If we raced with reconciliation, just - * leave the previous value here: at worst, we will write a - * page in a checkpoint when not absolutely necessary. + * the atomic increment. + * + * If the page was dirty on entry, then last_running == 0. The + * page could have become clean since then, if reconciliation + * completed. In that case, we leave the previous value for + * first_dirty_txn rather than potentially racing to update it, + * at worst, we'll unnecessarily write a page in a checkpoint. */ if (last_running != 0) page->modify->first_dirty_txn = last_running; @@ -366,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_page_modify_clear -- + * Clean a modified page. + */ +static inline void +__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + /* + * The page must be held exclusive when this call is made, this call + * can only be used when the page is owned by a single thread. + * + * Allow the call to be made on clean pages. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } +} + +/* * __wt_page_modify_set -- * Mark the page and tree dirty. */ @@ -385,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * shouldn't cause problems; regardless, let's play it safe.) */ if (S2BT(session)->modified == 0) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + S2BT(session)->modified = 1; WT_FULL_BARRIER(); } @@ -426,7 +458,7 @@ __wt_page_parent_modify_set( * __wt_off_page -- * Return if a pointer references off-page data. */ -static inline int +static inline bool __wt_off_page(WT_PAGE *page, const void *p) { /* @@ -527,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref) static inline void __wt_ref_key_clear(WT_REF *ref) { - /* The key union has 2 fields, both of which are 8B. */ + /* + * The key union has 2 8B fields; this is equivalent to: + * + * ref->key.recno = WT_RECNO_OOB; + * ref->key.ikey = NULL; + */ ref->key.recno = 0; } @@ -537,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref) * had without unpacking a cell, and information about the cell, if the key * isn't cheaply available. */ -static inline int +static inline bool __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) { @@ -628,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (cellp != NULL) *cellp = WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); - return (0); + return (false); case WT_K_FLAG: /* Encoded key: no instantiated key, no cell. */ if (cellp != NULL) @@ -639,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); *sizep = WT_K_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); case WT_KV_FLAG: /* Encoded key/value pair: no instantiated key, no cell. */ if (cellp != NULL) @@ -652,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET( page, WT_KV_DECODE_KEY_OFFSET(v)); *sizep = WT_KV_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); } @@ -667,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (datap != NULL) { *(void **)datap = WT_IKEY_DATA(ikey); *sizep = ikey->size; - return (1); + return (true); } - return (0); + return (false); } /* @@ -857,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack) * __wt_row_leaf_value -- * Return the value for a row-store leaf page encoded key/value pair. */ -static inline int +static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) { uintptr_t v; @@ -873,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); value->size = WT_KV_DECODE_VALUE_LEN(v); - return (1); + return (true); } - return (0); + return (false); } /* @@ -934,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session, * __wt_page_can_split -- * Check whether a page can be split in memory. */ -static inline int +static inline bool __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT_HEAD *ins_head; + WT_INSERT *ins; + int i; btree = S2BT(session); @@ -947,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * of the page could continually split without benefit. */ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) - return (0); + return (false); /* * Check for pages with append-only workloads. A common application * pattern is to have multiple threads frantically appending to the * tree. We want to reconcile and evict this page, but we'd like to - * do it without making the appending threads wait. If we're not - * discarding the tree, check and see if it's worth doing a split to - * let the threads continue before doing eviction. - * - * Ignore anything other than large, dirty row-store leaf pages. + * do it without making the appending threads wait. See if it's worth + * doing a split to let the threads continue before doing eviction. * - * XXX KEITH - * Need a better test for append-only workloads. + * Ignore anything other than large, dirty row-store leaf pages. The + * split code only supports row-store pages, and we depend on the page + * being dirty for correctness (the page must be reconciled again + * before being evicted after the split, information from a previous + * reconciliation will be wrong, so we can't evict immediately). */ if (page->type != WT_PAGE_ROW_LEAF || page->memory_footprint < btree->maxmempage || !__wt_page_is_modified(page)) - return (0); - - /* Don't split a page that is pending a multi-block split. */ - if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK)) - return (0); + return (false); /* * There is no point splitting if the list is small, no deep items is - * our heuristic for that. (A 1/4 probability of adding a new skiplist - * level means there will be a new 6th level for roughly each 4KB of - * entries in the list. If we have at least two 6th level entries, the - * list is at least large enough to work with.) - * - * The following code requires at least two items on the insert list, - * this test serves the additional purpose of confirming that. + * our heuristic for that. A 1/4 probability of adding a new skiplist + * level, with level-0 always created, means there will be a 5th level + * entry for roughly every 1024 entries in the list. If there are at + * least 4 5th level entries (4K items), the list is large enough. */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1) ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); - if (ins_head == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == - ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - return (0); - - return (1); + if (ins_head == NULL) + return (false); + for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) + if (++i == 4) { + WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable); + return (true); + } + return (false); } /* * __wt_page_can_evict -- * Check whether a page can be evicted. */ -static inline int +static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits, int *inmem_splitp) { @@ -1011,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, btree = S2BT(session); mod = page->modify; - txn_global = &S2C(session)->txn_global; /* Pages that have never been modified can always be evicted. */ if (mod == NULL) - return (1); + return (true); + + /* + * Check for in-memory splits before other eviction tests. If the page + * should split in-memory, return success immediately and skip more + * detailed eviction tests. We don't need further tests since the page + * won't be written or discarded from the cache. + */ + if (__wt_page_can_split(session, page)) { + if (inmem_splitp != NULL) + *inmem_splitp = 1; + return (true); + } /* * If the tree was deepened, there's a requirement that newly created @@ -1028,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, */ if (check_splits && WT_PAGE_IS_INTERNAL(page) && !__wt_txn_visible_all(session, mod->mod_split_txn)) - return (0); - - /* - * Allow for the splitting of pages when a checkpoint is underway only - * if the allow_splits flag has been passed, we know we are performing - * a checkpoint, the page is larger than the stated maximum and there - * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK - * flag is unset. - */ - if (__wt_page_can_split(session, page)) { - if (inmem_splitp != NULL) - *inmem_splitp = 1; - return (1); - } + return (false); /* * If the file is being checkpointed, we can't evict dirty pages: @@ -1049,25 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * previous version might be referenced by an internal page already * been written in the checkpoint, leaving the checkpoint inconsistent. */ - if (btree->checkpointing && - (__wt_page_is_modified(page) || - F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) { + if (btree->checkpointing && __wt_page_is_modified(page)) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint); - return (0); + return (false); } /* - * If the page was recently split in-memory, don't force it out: we - * hope an eviction thread will find it first. The check here is - * similar to __wt_txn_visible_all, but ignores the checkpoint's - * transaction. + * If the page was recently split in-memory, don't evict it immediately: + * we want to give application threads that are appending a chance to + * move to the new leaf page created by the split. + * + * Note the check here is similar to __wt_txn_visible_all, but ignores + * the checkpoint's transaction. */ - if (check_splits && - WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) - return (0); + if (check_splits) { + txn_global = &S2C(session)->txn_global; + if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) + return (false); + } - return (1); + return (true); } /* @@ -1100,7 +1135,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) (void)__wt_atomic_addv32(&btree->evict_busy, 1); too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0; - if ((ret = __wt_evict_page(session, ref)) == 0) { + if ((ret = __wt_evict(session, ref, 0)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -1151,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * memory_page_max setting, when we see many deleted items, and when we * are attempting to scan without trashing the cache. * - * Fast checks if eviction is disabled for this operation or this tree, - * then perform a general check if eviction will be possible. + * Fast checks if eviction is disabled for this handle, operation or + * tree, then perform a general check if eviction will be possible. */ page = ref->page; if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || !__wt_page_can_evict(session, page, 1, NULL)) return (__wt_hazard_clear(session, page)); @@ -1272,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session) } /* - * __wt_btree_lsm_size -- + * __wt_btree_lsm_over_size -- * Return if the size of an in-memory tree with a single leaf page is over * a specified maximum. If called on anything other than a simple tree with a * single leaf page, returns true so our LSM caller will switch to a new tree. */ -static inline int -__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) +static inline bool +__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; @@ -1290,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) /* Check for a non-existent tree. */ if (root == NULL) - return (0); + return (false); /* A tree that can be evicted always requires a switch. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) - return (1); + return (true); /* Check for a tree with a single leaf page. */ WT_INTL_INDEX_GET(session, root, pindex); if (pindex->entries != 1) /* > 1 child page, switch */ - return (1); + return (true); first = pindex->index[0]; if (first->state != WT_REF_MEM) /* no child page, ignore */ - return (0); + return (false); /* * We're reaching down into the page without a hazard pointer, but @@ -1312,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) */ child = first->page; if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */ - return (1); + return (true); return (child->memory_footprint > maxsize); } |