diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-18 11:28:45 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-18 11:28:45 +1100 |
commit | 9bd711a3726aa49236e63d82074eedfc6d91a528 (patch) | |
tree | 1425393c6a0e29e93334dd74706392d40610ec76 /src | |
parent | ee33b0875afe660fa3888a4361c9cf215f27d582 (diff) | |
parent | e2f11301fe0f877225dbd102aba458b887cd40ee (diff) | |
download | mongo-9bd711a3726aa49236e63d82074eedfc6d91a528.tar.gz |
Merge branch 'develop' into wt-1315-join-cursor
Also fix some type conversion warnings.
Diffstat (limited to 'src')
40 files changed, 1497 insertions, 1030 deletions
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index e60f7b3fb02..389ac761c5b 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -586,8 +586,8 @@ skip_evict: * CPU to no purpose. */ if (stalled) - wait_cnt += 1000; - else if (++wait_cnt < 1000) { + wait_cnt += WT_THOUSAND; + else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } @@ -603,7 +603,7 @@ skip_evict: if (cache_work) continue; } - sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 9e45bf10a5c..eaeac683f9a 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -169,54 +169,58 @@ __split_safe_free(WT_SESSION_IMPL *session, return (__split_stash_add(session, split_gen, p, s)); } +#ifdef HAVE_DIAGNOSTIC /* - * __split_should_deepen -- - * Return if we should deepen the tree. + * __split_verify_intl_key_order -- + * Verify the key order on an internal page after a split, diagnostic only. */ -static bool -__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) +static void +__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_ITEM *next, _next, *last, _last, *tmp; + WT_REF *ref; + uint64_t recno; + int cmp; + bool first; btree = S2BT(session); - page = ref->page; - - /* - * Our caller is holding the parent page locked to single-thread splits, - * which means we can safely look at the page's index without setting a - * split generation. - */ - pindex = WT_INTL_INDEX_GET_SAFE(page); - - /* - * Sanity check for a reasonable number of keys on-page keys. Splitting - * with too few keys leads to excessively deep trees. 
- */ - if (pindex->entries < 100) - return (false); - - /* - * Deepen the tree if the page's memory footprint is larger than the - * maximum size for a page in memory (presumably putting eviction - * pressure on the cache). - */ - if (page->memory_footprint > btree->maxmempage) - return (true); - /* - * Check if the page has enough keys to make it worth splitting. If - * the number of keys is allowed to grow too large, the cost of - * splitting into parent pages can become large enough to result - * in slow operations. - */ - if (!__wt_ref_is_root(ref) && - pindex->entries > btree->split_deepen_min_child) - return (true); + switch (page->type) { + case WT_PAGE_COL_INT: + recno = 0; /* Less than any valid record number. */ + WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->key.recno > recno); + recno = ref->key.recno; + } WT_INTL_FOREACH_END; + break; + case WT_PAGE_ROW_INT: + next = &_next; + WT_CLEAR(_next); + last = &_last; + WT_CLEAR(_last); - return (false); + first = true; + WT_INTL_FOREACH_BEGIN(session, page, ref) { + __wt_ref_key(page, ref, &next->data, &next->size); + if (last->size == 0) { + if (first) + first = false; + else { + WT_ASSERT(session, __wt_compare( + session, btree->collator, last, + next, &cmp) == 0); + WT_ASSERT(session, cmp < 0); + } + } + tmp = last; + last = next; + next = tmp; + } WT_INTL_FOREACH_END; + break; + } } +#endif /* * __split_ovfl_key_cleanup -- @@ -267,47 +271,58 @@ __split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref) } /* - * __split_ref_deepen_move -- - * Move a WT_REF from a parent to a child in service of a split to deepen - * the tree, including updating the accounting information. + * __split_ref_move -- + * Move a WT_REF from one page to another, including updating accounting + * information. 
*/ static int -__split_ref_deepen_move(WT_SESSION_IMPL *session, - WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) +__split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, + WT_REF **from_refp, size_t *decrp, WT_REF **to_refp, size_t *incrp) { WT_ADDR *addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; + WT_REF *ref; size_t size; void *key; + ref = *from_refp; + /* + * The from-home argument is the page into which the "from" WT_REF may + * point, for example, if there's an on-page key the "from" WT_REF + * references, it will be on the page "from-home". + * * Instantiate row-store keys, and column- and row-store addresses in - * the WT_REF structures referenced by a page that's being split (and - * deepening the tree). The WT_REF structures aren't moving, but the - * index references are moving from the page we're splitting to a set - * of child pages, and so we can no longer reference the block image - * that remains with the page being split. + * the WT_REF structures referenced by a page that's being split. The + * WT_REF structures aren't moving, but the index references are moving + * from the page we're splitting to a set of new pages, and so we can + * no longer reference the block image that remains with the page being + * split. * * No locking is required to update the WT_REF structure because we're - * the only thread splitting the parent page, and there's no way for - * readers to race with our updates of single pointers. The changes - * have to be written before the page goes away, of course, our caller - * owns that problem. - * - * Row-store keys, first. + * the only thread splitting the page, and there's no way for readers + * to race with our updates of single pointers. The changes have to be + * written before the page goes away, of course, our caller owns that + * problem. 
*/ - if (parent->type == WT_PAGE_ROW_INT) { + if (from_home->type == WT_PAGE_ROW_INT) { + /* + * Row-store keys: if it's not yet instantiated, instantiate it. + * If already instantiated, check for overflow cleanup (overflow + * keys are always instantiated). + */ if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { - __wt_ref_key(parent, ref, &key, &size); + __wt_ref_key(from_home, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); ikey = ref->key.ikey; } else { - WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); - *parent_decrp += sizeof(WT_IKEY) + ikey->size; + WT_RET( + __split_ovfl_key_cleanup(session, from_home, ref)); + *decrp += sizeof(WT_IKEY) + ikey->size; } - *child_incrp += sizeof(WT_IKEY) + ikey->size; + *incrp += sizeof(WT_IKEY) + ikey->size; } /* @@ -316,7 +331,7 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, * get the address from the on-page cell. */ addr = ref->addr; - if (addr != NULL && !__wt_off_page(parent, addr)) { + if (addr != NULL && !__wt_off_page(from_home, addr)) { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( @@ -330,364 +345,1042 @@ __split_ref_deepen_move(WT_SESSION_IMPL *session, ref->addr = addr; } - /* And finally, the WT_REF itself. */ - WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); + /* And finally, copy the WT_REF pointer itself. */ + *to_refp = ref; + WT_MEM_TRANSFER(*decrp, *incrp, sizeof(WT_REF)); return (0); } -#ifdef HAVE_DIAGNOSTIC /* - * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * __split_child_block_evict_and_split -- + * Ensure the newly created child isn't evicted or split for now. 
*/ static void -__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) +__split_child_block_evict_and_split(WT_PAGE *child) { - WT_BTREE *btree; - WT_ITEM *next, _next, *last, _last, *tmp; - WT_REF *ref; - uint64_t recno; - int cmp; - bool first; + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page which previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * newly created page's modify structure has a field with a transaction + * ID that's checked before any internal page is evicted. Unfortunately, + * we don't know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but the act + * of updating the original page's index is what allows the eviction to + * happen. + * + * Once the split is live, newly created internal pages might themselves + * split. The split itself is not the problem: if a page splits before + * we fix up its WT_REF (in other words, a WT_REF we move is then moved + * again, before we reset the underlying page's parent reference), it's + * OK because the test we use to find a WT_REF and WT_PAGE that require + * fixing up is only that the WT_REF points to the wrong parent, not it + * points to a specific wrong parent. The problem is our fix up of the + * WT_REFs in the created page could race with the subsequent fix of the + * same WT_REFs (in a different created page), we'd have to acquire some + * lock to prevent that race, and that's going to be difficult at best. + * + * For now, block eviction and splits in newly created pages until they + * have been fixed up. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); +} - btree = S2BT(session); +/* + * __split_ref_move_final -- + * Finalize the moved WT_REF structures after the split succeeds. 
+ */ +static int +__split_ref_move_final( + WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) +{ + WT_DECL_RET; + WT_PAGE *child; + WT_REF *ref, *child_ref; + uint64_t txn_new_id; + uint32_t i; - switch (page->type) { - case WT_PAGE_COL_INT: - recno = 0; /* Less than any valid record number. */ - WT_INTL_FOREACH_BEGIN(session, page, ref) { - WT_ASSERT(session, ref->key.recno > recno); - recno = ref->key.recno; - } WT_INTL_FOREACH_END; - break; - case WT_PAGE_ROW_INT: - next = &_next; - WT_CLEAR(_next); - last = &_last; - WT_CLEAR(_last); + /* + * When creating new internal pages as part of a split, we set a field + * in those pages modify structure to prevent them from being evicted + * until all threads are known to have exited the index of the page that + * previously "owned" the WT_REF. Set that field to a safe value. + */ + txn_new_id = __wt_txn_new_id(session); - first = true; - WT_INTL_FOREACH_BEGIN(session, page, ref) { - __wt_ref_key(page, ref, &next->data, &next->size); - if (last->size == 0) { - if (first) - first = false; - else { - WT_ASSERT(session, __wt_compare( - session, btree->collator, last, - next, &cmp) == 0); - WT_ASSERT(session, cmp < 0); - } + /* + * The WT_REF structures moved to newly allocated child pages reference + * the wrong parent page and we have to fix that up. The problem is + * revealed when a thread of control searches for the child page's + * reference structure slot, and fails to find it because the parent + * page being searched no longer references the child. When that failure + * happens the thread waits for the reference's home page to be updated, + * which we do here: walk the children and fix them up. + */ + for (i = 0; i < entries; ++i, ++refp) { + ref = *refp; + + /* + * We don't hold hazard pointers on created pages, they cannot + * be evicted because the page-modify transaction value set as + * they were created prevents eviction. (See above, we reset + * that value as part of fixing up the page.) 
But, an eviction + * thread might be attempting to evict the page (the WT_REF may + * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF + * may be WT_REF_READING), or it may be in some other state. + * Acquire a hazard pointer for any in-memory pages so we know + * the state of the page. Ignore pages not in-memory (deleted, + * on-disk, being read), there's no in-memory structure to fix. + */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + continue; + WT_ERR(ret); + + child = ref->page; +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + /* + * The page's home reference may not be wrong, as we + * opened up access from the top of the tree already, + * disk pages may have been read in since then, and + * those pages would have correct parent references. + */ + if (child_ref->home != child) { + child_ref->home = child; + child_ref->pindex_hint = 0; + + child->modify->mod_split_txn = txn_new_id; } - tmp = last; - last = next; - next = tmp; } WT_INTL_FOREACH_END; - break; + WT_LEAVE_PAGE_INDEX(session); + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + WT_ERR(__wt_hazard_clear(session, child)); } + + /* + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + return (0); + +err: /* Something really bad just happened. 
*/ + WT_PANIC_RET(session, ret, "fatal error resolving a split"); } -#endif /* - * __split_deepen -- - * Split an internal page in-memory, deepening the tree. + * __split_root -- + * Split the root page in-memory, deepening the tree. */ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) +__split_root(WT_SESSION_IMPL *session, WT_PAGE *root) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; - WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; - size_t child_incr, parent_decr, parent_incr, size; + WT_REF **child_refp, *ref, **root_refp; + size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; - uint32_t children, chunk, i, j, moved_entries, new_entries, remain; - uint32_t skip_leading, slots; + uint32_t children, chunk, i, j, remain; + uint32_t slots; bool complete; void *p; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); btree = S2BT(session); alloc_index = NULL; - parent_incr = parent_decr = 0; + root_decr = root_incr = 0; complete = false; + /* The root page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, root)); + /* - * Our caller is holding the parent page locked to single-thread splits, + * Our caller is holding the root page locked to single-thread splits, * which means we can safely look at the page's index without setting a * split generation. */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); - - /* - * A prepending/appending workload will repeatedly deepen parts of the - * tree that aren't changing, and appending workloads are not uncommon. - * First, keep the first/last pages of the tree at their current level, - * to catch simple workloads. 
Second, track the number of entries which - * resulted from the last time we deepened this page, and if we refilled - * this page without splitting into those slots, ignore them for this - * split. It's not exact because an eviction might split into any part - * of the page: if 80% of the splits are at the end of the page, assume - * an append-style workload. Of course, the plan eventually fails: when - * repeatedly deepening this page for an append-only workload, we will - * progressively ignore more and more of the slots. When ignoring 90% of - * the slots, deepen the entire page again. - * - * Figure out how many slots we're leaving at this level and how many - * child pages we're creating. - */ -#undef skip_trailing -#define skip_trailing 1 - skip_leading = 1; - new_entries = pindex->entries - parent->pg_intl_deepen_split_last; - if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10) - skip_leading = parent->pg_intl_deepen_split_last; - if (skip_leading > (pindex->entries * 9) * 10) - skip_leading = 1; + pindex = WT_INTL_INDEX_GET_SAFE(root); /* - * In a few (rare) cases we split pages with only a few entries, and in - * those cases we keep it simple, 10 children, skip only first and last - * entries. Otherwise, split into a lot of child pages. + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. 
*/ - moved_entries = pindex->entries - (skip_leading + skip_trailing); - children = moved_entries / btree->split_deepen_per_child; + children = pindex->entries / btree->split_deepen_per_child; if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); children = 10; - skip_leading = 1; - moved_entries = - pindex->entries - (skip_leading + skip_trailing); } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", - parent, pindex->entries, children)); + "%p: %" PRIu32 " root page elements, splitting into %" PRIu32 + " children", + root, pindex->entries, children)); /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize - * the slots of the allocated WT_PAGE_INDEX to point to the pages we're - * keeping at the current level, and the rest of the slots to point to - * new WT_REF objects. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the root page, replacing the root's page-index. 
*/ - size = sizeof(WT_PAGE_INDEX) + - (children + skip_leading + skip_trailing) * sizeof(WT_REF *); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; + root_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = children + skip_leading + skip_trailing; - for (alloc_refp = alloc_index->index, - i = 0; i < skip_leading; ++alloc_refp, ++i) - alloc_index->index[i] = pindex->index[i]; - for (i = 0; i < children; ++alloc_refp, ++i) + alloc_index->entries = children; + alloc_refp = alloc_index->index; + for (i = 0; i < children; alloc_refp++, ++i) WT_ERR(__wt_calloc_one(session, alloc_refp)); - parent_incr += children * sizeof(WT_REF); - alloc_index->index[alloc_index->entries - 1] = - pindex->index[pindex->entries - 1]; + root_incr += children * sizeof(WT_REF); /* Allocate child pages, and connect them into the new page index. */ - chunk = moved_entries / children; - remain = moved_entries - chunk * (children - 1); - for (parent_refp = pindex->index + skip_leading, - alloc_refp = alloc_index->index + skip_leading, - i = 0; i < children; ++i) { + for (root_refp = pindex->index, + alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; WT_ERR(__wt_page_alloc( - session, parent->type, 0, slots, false, &child)); + session, root->type, 0, slots, false, &child)); /* - * Initialize the parent page's child reference; we need a copy - * of the page's key. + * Initialize the page's child reference; we need a copy of the + * page's key. 
*/ ref = *alloc_refp++; - ref->home = parent; + ref->home = root; ref->page = child; ref->addr = NULL; - if (parent->type == WT_PAGE_ROW_INT) { - __wt_ref_key(parent, *parent_refp, &p, &size); + if (root->type == WT_PAGE_ROW_INT) { + __wt_ref_key(root, *root_refp, &p, &size); WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); - parent_incr += sizeof(WT_IKEY) + size; + root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*parent_refp)->key.recno; + ref->key.recno = (*root_refp)->key.recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (parent->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*parent_refp)->key.recno; + if (root->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* - * Once the split goes live, the newly created internal pages - * might be evicted and their WT_REF structures freed. If those - * pages are evicted before threads exit the previous page index - * array, a thread might see a freed WT_REF. Set the eviction - * transaction requirement for the newly created internal pages. - */ - child->modify->mod_split_txn = __wt_txn_new_id(session); + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); /* * The newly allocated child's page index references the same - * structures as the parent. (We cannot move WT_REF structures, + * structures as the root. (We cannot move WT_REF structures, * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for * the page that has an page index entry for the WT_REF is about * to change. 
*/ - child_incr = 0; child_pindex = WT_INTL_INDEX_GET_SAFE(child); - for (child_refp = child_pindex->index, j = 0; j < slots; ++j) { - WT_ERR(__split_ref_deepen_move(session, - parent, *parent_refp, &parent_decr, &child_incr)); - *child_refp++ = *parent_refp++; - } + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++root_refp, ++j) + WT_ERR(__split_ref_move(session, root, + root_refp, &root_decr, child_refp, &child_incr)); + __wt_cache_page_inmem_incr(session, child, child_incr); } WT_ASSERT(session, - alloc_refp - alloc_index->index == - (ptrdiff_t)(alloc_index->entries - skip_trailing)); - WT_ASSERT(session, parent_refp - pindex->index == - (ptrdiff_t)(pindex->entries - skip_trailing)); + alloc_refp - alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* - * Confirm the parent page's index hasn't moved, then update it, which + * Confirm the root page's index hasn't moved, then update it, which * makes the split visible to threads descending the tree. From this * point on, we're committed to the split. * * A note on error handling: until this point, there's no problem with * unwinding on error. We allocated a new page index, a new set of * WT_REFs and a new set of child pages -- if an error occurred, the - * parent remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention + * root remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the root page, attention * needs to be paid. However, subsequent failures are relatively benign, * the split is OK and complete. For that reason, we ignore errors past * this point unless there's a panic. 
*/ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); + WT_INTL_INDEX_SET(root, alloc_index); + complete = true; + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, root)); +#endif + /* Fix up the moved WT_REF structures. */ + WT_ERR(__split_ref_move_final( + session, alloc_index->index, alloc_index->entries)); + + /* We've installed the allocated page-index, ensure error handling. */ + alloc_index = NULL; + + /* + * We can't free the previous root's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling: we have already + * updated the page with a new index. Even if stashing the old value + * fails, we don't roll back that change, because threads may already + * be using the new index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + root_decr += size; + + /* Adjust the root's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, root, root_incr); + __wt_cache_page_inmem_decr(session, root, root_decr); + __wt_page_modify_set(session, root); + +err: /* + * If complete is true, we saw an error after opening up the tree to + * descent through the root page's new index. There is nothing we + * can do, there are threads potentially active in both versions of + * the tree. + * + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) + __wt_free_ref_index(session, root, alloc_index, true); + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during root page split to " + "deepen the tree"); + return (ret == WT_PANIC || !complete ? 
ret : 0); +} + +/* + * __split_parent -- + * Resolve a multi-page split, inserting new information into the parent. + */ +static int +__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, + uint32_t new_entries, size_t parent_incr, bool exclusive, bool discard) +{ + WT_DECL_RET; + WT_IKEY *ikey; + WT_PAGE *parent; + WT_PAGE_INDEX *alloc_index, *pindex; + WT_REF **alloc_refp, *next_ref; + size_t parent_decr, size; + uint64_t split_gen; + uint32_t i, j; + uint32_t deleted_entries, parent_entries, result_entries; + bool complete, empty_parent; + + parent = ref->home; + + alloc_index = pindex = NULL; + parent_decr = 0; + parent_entries = 0; + complete = empty_parent = false; + + /* The parent page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, parent)); + + /* + * We've locked the parent, which means it cannot split (which is the + * only reason to worry about split generation values). + */ + pindex = WT_INTL_INDEX_GET_SAFE(parent); + parent_entries = pindex->entries; + + /* + * Remove any refs to deleted pages while we are splitting, we have + * the internal page locked down, and are copying the refs into a new + * array anyway. Switch them to the special split state, so that any + * reading thread will restart. + */ + for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); + if (next_ref->state == WT_REF_DELETED && + __wt_delete_page_skip(session, next_ref, true) && + __wt_atomic_casv32( + &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) + deleted_entries++; + } + + /* + * The final entry count consists of the original count, plus any new + * pages, less any WT_REFs we're removing (deleted entries plus the + * entry we're replacing). 
+ */ + result_entries = (parent_entries + new_entries) - (deleted_entries + 1); + + /* + * If there are no remaining entries on the parent, give up, we can't + * leave an empty internal page. Mark it to be evicted soon and clean + * up any references that have changed state. + */ + if (result_entries == 0) { + empty_parent = true; + __wt_page_evict_soon(parent); + goto err; + } + + /* + * Allocate and initialize a new page index array for the parent, then + * copy references from the original index array, plus references from + * the newly created split array, into place. + */ + size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = result_entries; + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref == ref) + for (j = 0; j < new_entries; ++j) { + ref_new[j]->home = parent; + *alloc_refp++ = ref_new[j]; + } + else if (next_ref->state != WT_REF_SPLIT) + /* Skip refs we have marked for deletion. */ + *alloc_refp++ = next_ref; + } + + /* Check that we filled in all the entries. */ + WT_ASSERT(session, + alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + + /* + * Confirm the parent page's index hasn't moved then update it, which + * makes the split visible to threads descending the tree. + */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - complete = true; + alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); #endif + + /* + * If discarding the page's original WT_REF field, reset it to split and + * increment the number of entries being discarded. 
Threads cursoring + * through the tree were blocked because that WT_REF state was set to + * locked. Changing the locked state to split unblocks those threads and + * causes them to re-calculate their position based on the just-updated + * parent page's index. + */ + if (discard) { + ++deleted_entries; + WT_PUBLISH(ref->state, WT_REF_SPLIT); + } + + /* + * Push out the changes: not required for correctness, but don't let + * threads spin on incorrect page references longer than necessary. + */ + WT_FULL_BARRIER(); + /* - * Save the number of entries created by deepening the tree and reset - * the count of splits into this page after that point. + * A note on error handling: failures before we swapped the new page + * index into the parent can be resolved by freeing allocated memory + * because the original page is unchanged, we can continue to use it + * and we have not yet modified the parent. Failures after we swap + * the new page index into the parent are also relatively benign, the + * split is OK and complete. For those reasons, we ignore errors past + * this point unless there's a panic. */ - parent->pg_intl_deepen_split_append = 0; - parent->pg_intl_deepen_split_last = alloc_index->entries; + complete = true; + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 + " (%s%" PRIu32 ")", + ref->page, ref->page == NULL ? + "unknown page type" : __wt_page_type_string(ref->page->type), + ref->page == NULL ? "reverse " : "", parent, + parent_entries, result_entries, + ref->page == NULL ? "-" : "+", + ref->page == NULL ? + parent_entries - result_entries : result_entries - parent_entries)); /* - * The moved reference structures now reference the wrong parent page, - * and we have to fix that up. The problem is revealed when a thread - * of control searches for a page's reference structure slot, and fails - * to find it because the page it's searching no longer references it. 
- * When that failure happens, the thread waits for the reference's home - * page to be updated, which we do here: walk the children and fix them - * up. + * The new page index is in place, free the WT_REF we were splitting and + * any deleted WT_REFs we found, modulo the usual safe free semantics. * - * We're not acquiring hazard pointers on these pages, they cannot be - * evicted because of the eviction transaction value set above. - */ - for (parent_refp = alloc_index->index, - i = alloc_index->entries; i > 0; ++parent_refp, --i) { - parent_ref = *parent_refp; - WT_ASSERT(session, parent_ref->home == parent); - if (parent_ref->state != WT_REF_MEM) + * Acquire a new split generation. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state != WT_REF_SPLIT) continue; + --deleted_entries; /* - * We left the first/last children of the parent at the current - * level to avoid bad split patterns, they might be leaf pages; - * check the page type before we continue. - */ - child = parent_ref->page; - if (!WT_PAGE_IS_INTERNAL(child)) - continue; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); -#endif - /* - * We have the parent locked, but there's nothing to prevent - * this child from splitting beneath us; ensure that reading - * the child's page index structure is safe. + * We set the WT_REF to split, discard it, freeing any resources + * it holds. + * + * Row-store trees where the old version of the page is being + * discarded: the previous parent page's key for this child page + * may have been an on-page overflow key. In that case, if the + * key hasn't been deleted, delete it now, including its backing + * blocks. We are exchanging the WT_REF that referenced it for + * the split page WT_REFs and their keys, and there's no longer + * any reference to it. 
Done after completing the split (if we + * failed, we'd leak the underlying blocks, but the parent page + * would be unaffected). */ - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + if (parent->type == WT_PAGE_ROW_INT) { + WT_TRET(__split_ovfl_key_cleanup( + session, parent, next_ref)); + ikey = __wt_ref_key_instantiated(next_ref); + if (ikey != NULL) { + size = sizeof(WT_IKEY) + ikey->size; + WT_TRET(__split_safe_free( + session, split_gen, exclusive, ikey, size)); + parent_decr += size; + } /* - * The page's parent reference may not be wrong, as we - * opened up access from the top of the tree already, - * pages may have been read in since then. Check and - * only update pages that reference the original page, - * they must be wrong. + * The page_del structure can be freed immediately: it + * is only read when the ref state is WT_REF_DELETED. + * The size of the structure wasn't added to the parent, + * don't decrement. */ - if (child_ref->home == parent) { - child_ref->home = child; - child_ref->pindex_hint = 0; + if (next_ref->page_del != NULL) { + __wt_free(session, + next_ref->page_del->update_list); + __wt_free(session, next_ref->page_del); } - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); + } + + WT_TRET(__split_safe_free( + session, split_gen, exclusive, next_ref, sizeof(WT_REF))); + parent_decr += sizeof(WT_REF); } + /* We freed the reference that was split in the loop above. */ + ref = NULL; + /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. + * We can't free the previous page index, there may be threads using it. + * Add it to the session discard list, to be freed when it's safe. 
*/ - WT_FULL_BARRIER(); - alloc_index = NULL; + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); + parent_decr += size; + + /* Adjust the parent's memory footprint and mark it dirty. */ + __wt_cache_page_inmem_incr(session, parent, parent_incr); + __wt_cache_page_inmem_decr(session, parent, parent_decr); + __wt_page_modify_set(session, parent); + +err: /* + * A note on error handling: if we completed the split, return success, + * nothing really bad can have happened, and our caller has to proceed + * with the split. + */ + if (!complete) { + for (i = 0; i < parent_entries; ++i) { + next_ref = pindex->index[i]; + if (next_ref->state == WT_REF_SPLIT) + next_ref->state = WT_REF_DELETED; + } + + __wt_free_ref_index(session, NULL, alloc_index, false); + + /* + * The split couldn't proceed because the parent would be empty, + * return EBUSY so our caller knows to unlock the WT_REF that's + * being deleted, but don't be noisy, there's nothing wrong. + */ + if (empty_parent) + return (EBUSY); + } + + if (ret != 0 && ret != WT_PANIC) + __wt_err(session, ret, + "ignoring not-fatal error during parent page split"); + return (ret == WT_PANIC || !complete ? ret : 0); +} + +/* + * __split_internal -- + * Split an internal page into its parent. 
+ */ +static int +__split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *child; + WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; + WT_REF **alloc_refp; + WT_REF **child_refp, *page_ref, **page_refp, *ref; + size_t child_incr, page_decr, page_incr, parent_incr, size; + uint64_t split_gen; + uint32_t children, chunk, i, j, remain; + uint32_t slots; + bool complete; + void *p; + + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); + + /* The page will be marked dirty, make sure that will succeed. */ + WT_RET(__wt_page_modify_init(session, page)); + + btree = S2BT(session); + alloc_index = replace_index = NULL; + page_ref = page->pg_intl_parent_ref; + page_decr = page_incr = parent_incr = 0; + complete = false; /* - * We can't free the previous parent's index, there may be threads using - * it. Add to the session's discard list, to be freed once we know no - * threads can still be using it. + * Our caller is holding the page locked to single-thread splits, which + * means we can safely look at the page's index without setting a split + * generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* + * Decide how many child pages to create, then calculate the standard + * chunk and whatever remains. Sanity check the number of children: + * the decision to split matched to the deepen-per-child configuration + * might get it wrong. 
+ */ + children = pindex->entries / btree->split_deepen_per_child; + if (children < 10) { + if (pindex->entries < 100) + return (EBUSY); + children = 10; + } + chunk = pindex->entries / children; + remain = pindex->entries - chunk * (children - 1); + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " internal page elements, splitting %" PRIu32 + " children into parent %p", + page, pindex->entries, children, parent)); + + /* + * Ideally, we'd discard the original page, but that's hard since other + * threads of control are using it (for example, if eviction is walking + * the tree and looking at the page.) Instead, perform a right-split, + * moving all except the first chunk of the page's WT_REF objects to new + * pages. * - * This change requires care with error handling: we have already - * updated the page with a new index. Even if stashing the old value - * fails, we don't roll back that change, because threads may already - * be using the new index. + * Create and initialize a replacement WT_PAGE_INDEX for the original + * page. */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, 0, pindex, size)); - parent_decr += size; + size = sizeof(WT_PAGE_INDEX) + chunk * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &replace_index)); + page_incr += size; + replace_index->index = (WT_REF **)(replace_index + 1); + replace_index->entries = chunk; + for (page_refp = pindex->index, i = 0; i < chunk; ++i) + replace_index->index[i] = *page_refp++; /* - * Adjust the parent's memory footprint. + * Allocate a new WT_PAGE_INDEX and set of WT_REF objects to be inserted + * into the page's parent, replacing the page's page-index. + * + * The first slot of the new WT_PAGE_INDEX is the original page WT_REF. + * The remainder of the slots are allocated WT_REFs. 
*/ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); + size = sizeof(WT_PAGE_INDEX) + children * sizeof(WT_REF *); + WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); + parent_incr += size; + alloc_index->index = (WT_REF **)(alloc_index + 1); + alloc_index->entries = children; + alloc_refp = alloc_index->index; + *alloc_refp++ = page_ref; + for (i = 1; i < children; ++alloc_refp, ++i) + WT_ERR(__wt_calloc_one(session, alloc_refp)); + parent_incr += children * sizeof(WT_REF); + + /* Allocate child pages, and connect them into the new page index. */ + WT_ASSERT(session, page_refp == pindex->index + chunk); + for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { + slots = i == children - 1 ? remain : chunk; + WT_ERR(__wt_page_alloc( + session, page->type, 0, slots, false, &child)); + + /* + * Initialize the page's child reference; we need a copy of the + * page's key. + */ + ref = *alloc_refp++; + ref->home = parent; + ref->page = child; + ref->addr = NULL; + if (page->type == WT_PAGE_ROW_INT) { + __wt_ref_key(page, *page_refp, &p, &size); + WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); + parent_incr += sizeof(WT_IKEY) + size; + } else + ref->key.recno = (*page_refp)->key.recno; + ref->state = WT_REF_MEM; + + /* Initialize the child page. */ + if (page->type == WT_PAGE_COL_INT) + child->pg_intl_recno = (*page_refp)->key.recno; + child->pg_intl_parent_ref = ref; + + /* Mark it dirty. */ + WT_ERR(__wt_page_modify_init(session, child)); + __wt_page_modify_set(session, child); + + /* Ensure the page isn't evicted or split for now. */ + __split_child_block_evict_and_split(child); + + /* + * The newly allocated child's page index references the same + * structures as the parent. (We cannot move WT_REF structures, + * threads may be underneath us right now changing the structure + * state.) 
However, if the WT_REF structures reference on-page + * information, we have to fix that, because the disk image for + * the page that has a page index entry for the WT_REF is about + * to be discarded. + */ + child_pindex = WT_INTL_INDEX_GET_SAFE(child); + child_incr = 0; + for (child_refp = child_pindex->index, + j = 0; j < slots; ++child_refp, ++page_refp, ++j) + WT_ERR(__split_ref_move(session, page, + page_refp, &page_decr, child_refp, &child_incr)); + + __wt_cache_page_inmem_incr(session, child, child_incr); + } + WT_ASSERT(session, alloc_refp - + alloc_index->index == (ptrdiff_t)alloc_index->entries); + WT_ASSERT(session, + page_refp - pindex->index == (ptrdiff_t)pindex->entries); + + /* Split into the parent. */ + WT_ERR(__split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)); + + /* + * A note on error handling: until this point, there's no problem with + * unwinding on error. We allocated a new page index, a new set of + * WT_REFs and a new set of child pages -- if an error occurred, the + * page remained unchanged, although it may have an incorrect memory + * footprint. From now on we've modified the parent page, attention + * needs to be paid. However, subsequent failures are relatively benign, + * the split is OK and complete. For that reason, we ignore errors past + * this point unless there's a panic. + */ + complete = true; + + /* Confirm the page's index hasn't moved, then update it. */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, page)); +#endif + + /* Fix up the moved WT_REF structures.
*/ + WT_ERR(__split_ref_move_final( + session, alloc_index->index + 1, alloc_index->entries - 1)); + + /* + * We don't care about the page-index we allocated, all we needed was + * the array of WT_REF structures, which has now been split into the + * parent page. + */ + __wt_free(session, alloc_index); + + /* + * We can't free the previous page's index, there may be threads using + * it. Add to the session's discard list, to be freed once we know no + * threads can still be using it. + * + * This change requires care with error handling, we've already updated + * the parent page. Even if stashing the old value fails, we don't roll + * back that change, because threads may already be using the new parent + * page. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); + WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); + page_decr += size; + + /* Adjust the page's memory footprint, and mark it dirty. */ + __wt_cache_page_inmem_incr(session, page, page_incr); + __wt_cache_page_inmem_decr(session, page, page_decr); + __wt_page_modify_set(session, page); err: /* * If complete is true, we saw an error after opening up the tree to - * descent through the parent page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. + * descent through the page's new index. There is nothing we can do, + * there are threads potentially active in both versions of the tree. * * A note on error handling: if we completed the split, return success, * nothing really bad can have happened, and our caller has to proceed * with the split. 
*/ - if (!complete) - __wt_free_ref_index(session, parent, alloc_index, true); + if (!complete) { + __wt_free_ref_index(session, page, alloc_index, true); + __wt_free_ref_index(session, page, replace_index, false); + } if (ret != 0 && ret != WT_PANIC) __wt_err(session, ret, - "ignoring not-fatal error during parent page split to " - "deepen the tree"); + "ignoring not-fatal error during internal page split"); return (ret == WT_PANIC || !complete ? ret : 0); } /* + * __split_internal_lock -- + * Lock an internal page. + */ +static int +__split_internal_lock( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *parent_ref; + + *hazardp = false; + *parentp = NULL; + + /* + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we want to split + * the parent, give up to avoid that deadlock. + */ + if (S2BT(session)->checkpointing != WT_CKPT_OFF) + return (EBUSY); + + /* + * Get a page-level lock on the parent to single-thread splits into the + * page because we need to single-thread sizing/growing the page index. + * It's OK to queue up multiple splits as the child pages split, but the + * actual split into the parent has to be serialized. Note we allocate + * memory inside of the lock and may want to invest effort in making the + * locked period shorter. + * + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. 
Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. + */ + for (;;) { + parent = ref->home; + + /* Skip pages that aren't ready to split. */ + if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) + return (EBUSY); + + WT_RET(__wt_fair_lock(session, &parent->page_lock)); + if (parent == ref->home) + break; + WT_RET(__wt_fair_unlock(session, &parent->page_lock)); + } + + /* + * We have exclusive access to split the parent, and at this point, the + * child prevents the parent from being evicted. However, once we + * update the parent's index, it may no longer refer to the child, and + * could conceivably be evicted. Get a hazard pointer on the parent + * now, so that we can safely access it after updating the index. + * + * Take care getting the page doesn't trigger eviction work: we could + * block trying to split a different child of our parent and deadlock + * or we could be the eviction server relied upon by other threads to + * populate the eviction queue. + */ + if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { + WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); + *hazardp = true; + } + + *parentp = parent; + return (0); + +err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_unlock -- + * Unlock the parent page. + */ +static int +__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) +{ + WT_DECL_RET; + + if (hazard) + ret = __wt_hazard_clear(session, parent); + + WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); + return (ret); +} + +/* + * __split_internal_should_split -- + * Return if we should split an internal page. 
+ */ +static bool +__split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_BTREE *btree; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + + btree = S2BT(session); + page = ref->page; + + /* + * Our caller is holding the parent page locked to single-thread splits, + * which means we can safely look at the page's index without setting a + * split generation. + */ + pindex = WT_INTL_INDEX_GET_SAFE(page); + + /* Sanity check for a reasonable number of on-page keys. */ + if (pindex->entries < 100) + return (false); + + /* + * Deepen the tree if the page's memory footprint is larger than the + * maximum size for a page in memory (presumably putting eviction + * pressure on the cache). + */ + if (page->memory_footprint > btree->maxmempage) + return (true); + + /* + * Check if the page has enough keys to make it worth splitting. If + * the number of keys is allowed to grow too large, the cost of + * splitting into parent pages can become large enough to result + * in slow operations. + */ + if (pindex->entries > btree->split_deepen_min_child) + return (true); + + return (false); +} + +/* + * __split_parent_climb -- + * Check if we should split up the tree. + */ +static int +__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_REF *ref; + bool parent_hazard; + + /* + * Page splits trickle up the tree, that is, as leaf pages grow large + * enough and are evicted, they'll split into their parent. And, as + * that parent page grows large enough and is evicted, it splits into + * its parent and so on. When the page split wave reaches the root, + * the tree will permanently deepen as multiple root pages are written. + * + * However, this only helps if internal pages are evicted (and we resist + * evicting internal pages for obvious reasons), or if the tree were to + * be closed and re-opened from a disk image, which may be a rare event. 
+ * + * To avoid internal pages becoming too large absent eviction, check + * parent pages each time pages are split into them. If the page is big + * enough, either split the page into its parent or, in the case of the + * root, deepen the tree. + * + * Split up the tree. + */ + for (;;) { + parent = NULL; + parent_hazard = false; + ref = page->pg_intl_parent_ref; + + /* If we don't need to split the page, we're done. */ + if (!__split_internal_should_split(session, ref)) + break; + + /* + * If we've reached the root page, there are no subsequent pages + * to review, deepen the tree and quit. + */ + if (__wt_ref_is_root(ref)) { + ret = __split_root(session, page); + break; + } + + /* + * Lock the parent and split into it, then swap the parent/page + * locks, lock-coupling up the tree. + */ + WT_ERR(__split_internal_lock( + session, ref, &parent, &parent_hazard)); + ret = __split_internal(session, parent, page); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + page = parent; + page_hazard = parent_hazard; + parent = NULL; + parent_hazard = false; + WT_ERR(ret); + } + +err: if (parent != NULL) + WT_TRET( + __split_internal_unlock(session, parent, parent_hazard)); + WT_TRET(__split_internal_unlock(session, page, page_hazard)); + + /* A page may have been busy, in which case return without error. */ + WT_RET_BUSY_OK(ret); + return (0); +} + +/* * __split_multi_inmem -- * Instantiate a page in a multi-block set. */ @@ -901,369 +1594,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, } /* - * __split_parent_lock -- - * Lock the parent page. - */ -static int -__split_parent_lock( - WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE **parentp, bool *hazardp) -{ - WT_DECL_RET; - WT_PAGE *parent; - WT_REF *parent_ref; - - *hazardp = false; - *parentp = NULL; - - /* - * A checkpoint reconciling this parent page can deadlock with - * our split. 
We have an exclusive page lock on the child before - * we acquire the page's reconciliation lock, and reconciliation - * acquires the page's reconciliation lock before it encounters - * the child's exclusive lock (which causes reconciliation to - * loop until the exclusive lock is resolved). If we want to split - * the parent, give up to avoid that deadlock. - */ - if (S2BT(session)->checkpointing != WT_CKPT_OFF) - return (EBUSY); - - /* - * Get a page-level lock on the parent to single-thread splits into the - * page because we need to single-thread sizing/growing the page index. - * It's OK to queue up multiple splits as the child pages split, but the - * actual split into the parent has to be serialized. Note we allocate - * memory inside of the lock and may want to invest effort in making the - * locked period shorter. - * - * We use the reconciliation lock here because not only do we have to - * single-thread the split, we have to lock out reconciliation of the - * parent because reconciliation of the parent can't deal with finding - * a split child during internal page traversal. Basically, there's no - * reason to use a different lock if we have to block reconciliation - * anyway. - */ - for (;;) { - parent = ref->home; - WT_RET(__wt_fair_lock(session, &parent->page_lock)); - if (parent == ref->home) - break; - /* Try again if the page deepened while we were waiting */ - WT_RET(__wt_fair_unlock(session, &parent->page_lock)); - } - - /* - * We have exclusive access to split the parent, and at this point, the - * child prevents the parent from being evicted. However, once we - * update the parent's index, it will no longer refer to the child, and - * could conceivably be evicted. Get a hazard pointer on the parent - * now, so that we can safely access it after updating the index. 
- * - * Take care getting the page doesn't trigger eviction work: we could - * block trying to split a different child of our parent and deadlock - * or we could be the eviction server relied upon by other threads to - * populate the eviction queue. - */ - if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { - WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); - *hazardp = true; - } - - *parentp = parent; - return (0); - -err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent_unlock -- - * Unlock the parent page. - */ -static int -__split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) -{ - WT_DECL_RET; - - if (hazard) - ret = __wt_hazard_clear(session, parent); - - WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); - return (ret); -} - -/* - * __split_parent -- - * Resolve a multi-page split, inserting new information into the parent. - */ -static int -__split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) -{ - WT_DECL_RET; - WT_IKEY *ikey; - WT_PAGE *parent; - WT_PAGE_INDEX *alloc_index, *pindex; - WT_REF **alloc_refp, *next_ref, *parent_ref; - size_t parent_decr, size; - uint64_t split_gen; - uint32_t i, j; - uint32_t deleted_entries, parent_entries, result_entries; - bool complete; - - parent = ref->home; - parent_ref = parent->pg_intl_parent_ref; - - alloc_index = pindex = NULL; - parent_decr = 0; - parent_entries = 0; - complete = false; - - /* - * We've locked the parent, which means it cannot split (which is the - * only reason to worry about split generation values). - */ - pindex = WT_INTL_INDEX_GET_SAFE(parent); - parent_entries = pindex->entries; - - /* - * Remove any refs to deleted pages while we are splitting, we have - * the internal page locked down, and are copying the refs into a new - * array anyway. 
Switch them to the special split state, so that any - * reading thread will restart. Include the ref we are splitting in - * the count to be deleted. - */ - for (deleted_entries = 1, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); - if (next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref, true) && - __wt_atomic_casv32( - &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) - deleted_entries++; - } - - /* - * The final entry count consists of the original count, plus any new - * pages, less any WT_REFs we're removing. - */ - result_entries = (parent_entries + new_entries) - deleted_entries; - - /* - * If the entire (sub)tree is empty, give up: we can't leave an empty - * internal page. Mark it to be evicted soon and clean up any - * references that have changed state. - */ - if (result_entries == 0) { - __wt_page_evict_soon(parent); - goto err; - } - - /* - * Allocate and initialize a new page index array for the parent, then - * copy references from the original index array, plus references from - * the newly created split array, into place. - */ - size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); - WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); - parent_incr += size; - alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref == ref) { - for (j = 0; j < new_entries; ++j) { - ref_new[j]->home = parent; - *alloc_refp++ = ref_new[j]; - - /* - * Clear the split reference as it moves to the - * allocated page index, so it never appears on - * both after an error. 
- */ - ref_new[j] = NULL; - } - - /* - * We detect append-style workloads to avoid repeatedly - * deepening parts of the tree where no work is being - * done by tracking if we're splitting after the slots - * created by the last split to deepen this parent. - * - * Note the calculation: i is a 0-based array offset and - * split-last is a count of entries, also either or both - * i and split-last might be unsigned 0, don't decrement - * either one. - */ - if (i > parent->pg_intl_deepen_split_last) - parent-> - pg_intl_deepen_split_append += new_entries; - } else if (next_ref->state != WT_REF_SPLIT) - /* Skip refs we have marked for deletion. */ - *alloc_refp++ = next_ref; - } - - /* Check that we filled in all the entries. */ - WT_ASSERT(session, - alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); - - /* - * Confirm the parent page's index hasn't moved then update it, which - * makes the split visible to threads descending the tree. - */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); - WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); - alloc_index = NULL; - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif - - /* - * Reset the page's original WT_REF field to split. Threads cursoring - * through the tree were blocked because that WT_REF state was set to - * locked. This update changes the locked state to split, unblocking - * those threads and causing them to re-calculate their position based - * on the updated parent page's index. - */ - WT_PUBLISH(ref->state, WT_REF_SPLIT); - - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. 
Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - - WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%s split into parent %" PRIu32 " -> %" PRIu32 - " (%" PRIu32 ")", ref->page == NULL ? - "reverse" : __wt_page_type_string(ref->page->type), - parent_entries, result_entries, result_entries - parent_entries)); - - /* - * The new page index is in place, free the WT_REF we were splitting - * and any deleted WT_REFs we found, modulo the usual safe free - * semantics. - */ - for (i = 0; deleted_entries > 0 && i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state != WT_REF_SPLIT) - continue; - --deleted_entries; - - /* - * We set the WT_REF to split, discard it, freeing any resources - * it holds. - * - * Row-store trees where the old version of the page is being - * discarded: the previous parent page's key for this child page - * may have been an on-page overflow key. In that case, if the - * key hasn't been deleted, delete it now, including its backing - * blocks. We are exchanging the WT_REF that referenced it for - * the split page WT_REFs and their keys, and there's no longer - * any reference to it. Done after completing the split (if we - * failed, we'd leak the underlying blocks, but the parent page - * would be unaffected). - */ - if (parent->type == WT_PAGE_ROW_INT) { - WT_TRET(__split_ovfl_key_cleanup( - session, parent, next_ref)); - ikey = __wt_ref_key_instantiated(next_ref); - if (ikey != NULL) { - size = sizeof(WT_IKEY) + ikey->size; - WT_TRET(__split_safe_free( - session, split_gen, 0, ikey, size)); - parent_decr += size; - } - /* - * The page_del structure can be freed immediately: it - * is only read when the ref state is WT_REF_DELETED. - * The size of the structure wasn't added to the parent, - * don't decrement. 
- */ - if (next_ref->page_del != NULL) { - __wt_free(session, - next_ref->page_del->update_list); - __wt_free(session, next_ref->page_del); - } - } - - WT_TRET(__split_safe_free( - session, split_gen, 0, next_ref, sizeof(WT_REF))); - parent_decr += sizeof(WT_REF); - } - - /* We freed the reference that was split in the loop above. */ - ref = NULL; - - /* - * We can't free the previous page index, there may be threads using it. - * Add it to the session discard list, to be freed when it's safe. - */ - size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); - parent_decr += size; - - /* - * Adjust the parent's memory footprint. - */ - __wt_cache_page_inmem_incr(session, parent, parent_incr); - __wt_cache_page_inmem_decr(session, parent, parent_decr); - - /* - * Simple page splits trickle up the tree, that is, as leaf pages grow - * large enough and are evicted, they'll split into their parent. And, - * as that parent grows large enough and is evicted, it will split into - * its parent and so on. When the page split wave reaches the root, - * the tree will permanently deepen as multiple root pages are written. - * However, this only helps if first, the pages are evicted (and - * we resist evicting internal pages for obvious reasons), and second, - * if the tree is closed and re-opened from a disk image, which may be - * a rare event. - * To avoid the case of internal pages becoming too large when they - * aren't being evicted, check internal pages each time a leaf page is - * split into them. If it's big enough, deepen the tree at that point. - * Do the check here because we've just grown the parent page and - * are holding it locked. 
- */ - if (ret == 0 && !exclusive && - __split_should_deepen(session, parent_ref)) - ret = __split_deepen(session, parent); - -err: /* - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { - for (i = 0; i < parent_entries; ++i) { - next_ref = pindex->index[i]; - if (next_ref->state == WT_REF_SPLIT) - next_ref->state = WT_REF_DELETED; - } - - /* If we gave up on a reverse split, unlock the child. */ - if (ref_new == NULL) { - WT_ASSERT(session, ref->state == WT_REF_LOCKED); - ref->state = WT_REF_DELETED; - } - - __wt_free_ref_index(session, NULL, alloc_index, false); - } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); -} - -/* * __split_insert -- * Split a page's last insert list entries into a separate page. */ @@ -1279,6 +1609,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) size_t page_decr, parent_incr, right_incr; int i; + WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); + page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; @@ -1491,7 +1824,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false)) != 0) { + session, ref, split_ref, 2, parent_incr, false, true)) != 0) { /* * Move the insert list element back to the original page list. 
* For simplicity, the previous skip list pointers originally @@ -1513,9 +1846,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(ret); } - WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); - WT_STAT_FAST_DATA_INCR(session, cache_inmem_split); - return (0); err: if (split_ref[0] != NULL) { @@ -1543,83 +1873,21 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_insert(session, ref); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} - -/* - * __wt_split_reverse -- - * We have a locked ref that is empty and we want to rewrite the index in - * its parent. - */ -int -__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *parent; - bool hazard; - - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_parent(session, ref, NULL, 0, 0, 0); - WT_TRET(__split_parent_unlock(session, parent, hazard)); - return (ret); -} - -/* - * __wt_split_rewrite -- - * Rewrite an in-memory page with a new version. - */ -int -__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_MODIFY *mod; - WT_REF new; - - page = ref->page; - mod = page->modify; - - /* - * This isn't a split: a reconciliation failed because we couldn't write - * something, and in the case of forced eviction, we need to stop this - * page from being such a problem. We have exclusive access, rewrite the - * page in memory. The code lives here because the split code knows how - * to re-create a page in memory after it's been reconciled, and that's - * exactly what we want to do. - * - * Build the new page. - */ - memset(&new, 0, sizeof(new)); - WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-insert", ref->page)); - /* - * The rewrite succeeded, we can no longer fail. 
- * - * Finalize the move, discarding moved update lists from the original - * page. - */ - __split_multi_inmem_final(page, &mod->mod_multi[0]); + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_insert(session, ref)) != 0) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } /* - * Discard the original page. - * - * Pages with unresolved changes are not marked clean during - * reconciliation, do it now. + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. */ - __wt_page_modify_clear(session, page); - __wt_ref_out(session, ref); - - /* Swap the new page into place. */ - ref->page = new.page; - WT_PUBLISH(ref->state, WT_REF_MEM); - - return (0); - -err: __split_multi_inmem_fail(session, &new); - return (ret); + return (__split_parent_climb(session, parent, hazard)); } /* @@ -1636,6 +1904,9 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) size_t parent_incr; uint32_t i, new_entries; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_leaf); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_leaf); + page = ref->page; mod = page->modify; new_entries = mod->mod_multi_entries; @@ -1656,10 +1927,7 @@ __split_multi(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * exclusively. */ WT_ERR(__split_parent( - session, ref, ref_new, new_entries, parent_incr, closing)); - - WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); - WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); + session, ref, ref_new, new_entries, parent_incr, closing, true)); /* * The split succeeded, we can no longer fail. 
@@ -1697,8 +1965,98 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); - ret = __split_multi(session, ref, closing); - WT_TRET(__split_parent_unlock(session, parent, hazard)); + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-multi", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); + } + + /* + * Split up through the tree as necessary; we're holding the original + * parent page locked, note the functions we call are responsible for + * releasing that lock. + */ + return (__split_parent_climb(session, parent, hazard)); +} + +/* + * __wt_split_reverse -- + * We have a locked ref that is empty and we want to rewrite the index in + * its parent. + */ +int +__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + bool hazard; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: reverse-split", ref->page)); + + WT_RET(__split_internal_lock(session, ref, &parent, &hazard)); + ret = __split_parent(session, ref, NULL, 0, 0, false, true); + WT_TRET(__split_internal_unlock(session, parent, hazard)); + return (ret); +} + +/* + * __wt_split_rewrite -- + * Rewrite an in-memory page with a new version. + */ +int +__wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_MODIFY *mod; + WT_REF new; + + page = ref->page; + mod = page->modify; + + WT_RET(__wt_verbose( + session, WT_VERB_SPLIT, "%p: split-rewrite", ref->page)); + + /* + * This isn't a split: a reconciliation failed because we couldn't write + * something, and in the case of forced eviction, we need to stop this + * page from being such a problem. We have exclusive access, rewrite the + * page in memory. 
The code lives here because the split code knows how + * to re-create a page in memory after it's been reconciled, and that's + * exactly what we want to do. + * + * Build the new page. + */ + memset(&new, 0, sizeof(new)); + WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + + /* + * The rewrite succeeded, we can no longer fail. + * + * Finalize the move, discarding moved update lists from the original + * page. + */ + __split_multi_inmem_final(page, &mod->mod_multi[0]); + + /* + * Discard the original page. + * + * Pages with unresolved changes are not marked clean during + * reconciliation, do it now. + */ + __wt_page_modify_clear(session, page); + __wt_ref_out(session, ref); + + /* Swap the new page into place. */ + ref->page = new.page; + WT_PUBLISH(ref->state, WT_REF_MEM); + + return (0); + +err: __split_multi_inmem_fail(session, &new); return (ret); } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7395cce11e1..07bb2eb3a01 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -191,7 +191,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) syncop == WT_SYNC_WRITE_LEAVES ? "WRITE_LEAVES" : "CHECKPOINT", leaf_bytes, leaf_pages, internal_bytes, internal_pages, - WT_TIMEDIFF(end, start) / WT_MILLION)); + WT_TIMEDIFF_MS(end, start))); } err: /* On error, clear any left-over tree walk. */ diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index d02f23ed164..e9fa570f97b 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -22,7 +22,7 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_INSERT *ins; WT_INSERT_HEAD *ins_head; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; uint32_t base, indx, limit; int depth; @@ -37,10 +37,12 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } +restart_root: /* Search the internal pages of the tree. 
*/ current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -51,8 +53,19 @@ restart: page = current->page; descent = pindex->index[base - 1]; /* Fast path appends. */ - if (recno >= descent->key.recno) + if (recno >= descent->key.recno) { + /* + * If on the last slot (the key is larger than any key + * on the page), check for an internal page split race. + */ + if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + WT_RET(__wt_page_release(session, current, 0)); + goto restart_root; + } goto descend; + } /* Binary search of internal pages. */ for (base = 0, @@ -90,15 +103,13 @@ descend: /* * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. */ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: - goto restart; - default: - return (ret); + continue; } + if (ret == WT_RESTART) + goto restart_page; + return (ret); } /* Track how deep the tree gets. */ diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 7b21f1e40bb..d2d8a4640ca 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -144,7 +144,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_DECL_RET; WT_ITEM *item; WT_PAGE *page; - WT_PAGE_INDEX *pindex; + WT_PAGE_INDEX *pindex, *parent_pindex; WT_REF *current, *descent; WT_ROW *rip; size_t match, skiphigh, skiplow; @@ -155,16 +155,16 @@ __wt_row_search(WT_SESSION_IMPL *session, btree = S2BT(session); collator = btree->collator; item = cbt->tmp; + current = NULL; __cursor_pos_clear(cbt); /* - * The row-store search routine uses a different comparison API. 
- * The assumption is we're comparing more than a few keys with - * matching prefixes, and it's a win to avoid the memory fetches - * by skipping over those prefixes. That's done by tracking the - * length of the prefix match for the lowest and highest keys we - * compare as we descend the tree. + * In some cases we expect we're comparing more than a few keys with + * matching prefixes, so it's faster to avoid the memory fetches by + * skipping over those prefixes. That's done by tracking the length of + * the prefix match for the lowest and highest keys we compare as we + * descend the tree. */ skiphigh = skiplow = 0; @@ -186,10 +186,11 @@ __wt_row_search(WT_SESSION_IMPL *session, } /* Search the internal pages of the tree. */ - cmp = -1; +restart_root: current = &btree->root; - for (depth = 2;; ++depth) { -restart: page = current->page; + for (depth = 2, pindex = NULL;; ++depth) { + parent_pindex = pindex; +restart_page: page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -211,7 +212,7 @@ restart: page = current->page; WT_ERR(__wt_compare( session, collator, srch_key, item, &cmp)); if (cmp >= 0) - goto descend; + goto append; /* A failed append check turns off append checks. */ append_check = false; @@ -252,7 +253,26 @@ restart: page = current->page; } else if (cmp == 0) goto descend; } - else if (collator == NULL) + else if (collator == NULL) { + /* + * Reset the skipped prefix counts; we'd normally expect + * the parent's skipped prefix values to be larger than + * the child's values and so we'd only increase them as + * we walk down the tree (in other words, if we can skip + * N bytes on the parent, we can skip at least N bytes + * on the child). However, if a child internal page was + * split up into the parent, the child page's key space + * will have been truncated, and the values from the + * parent's search may be wrong for the child. 
We only + * need to reset the high count because the split-page + * algorithm truncates the end of the internal page's + * key space, the low count is still correct. We also + * don't need to clear either count when transitioning + * to a leaf page, a leaf page's key space can't change + * in flight. + */ + skiphigh = 0; + for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -271,7 +291,7 @@ restart: page = current->page; else goto descend; } - else + } else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); descent = pindex->index[indx]; @@ -288,9 +308,10 @@ restart: page = current->page; } /* - * Set the slot to descend the tree: descent is already set if - * there was an exact match on the page, otherwise, base is - * the smallest index greater than key, possibly (last + 1). + * Set the slot to descend the tree: descent was already set if + * there was an exact match on the page, otherwise, base is the + * smallest index greater than key, possibly one past the last + * slot. */ descent = pindex->index[base - 1]; @@ -298,25 +319,41 @@ restart: page = current->page; * If we end up somewhere other than the last slot, it's not a * right-side descent. */ - if (pindex->entries != base - 1) + if (pindex->entries != base) descend_right = false; + /* + * If on the last slot (the key is larger than any key on the + * page), check for an internal page split race. + */ + if (pindex->entries == base) { +append: if (parent_pindex != NULL && + __wt_split_intl_race( + session, current->home, parent_pindex)) { + if ((ret = __wt_page_release( + session, current, 0)) != 0) + return (ret); + + skiplow = skiphigh = 0; + goto restart_root; + } + } + descend: /* * Swap the current page for the child page. If the page splits * while we're retrieving it, restart the search in the current * page; otherwise return on error, the swap call ensures we're * holding nothing on failure. 
*/ - switch (ret = __wt_page_swap(session, current, descent, 0)) { - case 0: + if ((ret = __wt_page_swap(session, current, descent, 0)) == 0) { current = descent; - break; - case WT_RESTART: + continue; + } + if (ret == WT_RESTART) { skiphigh = skiplow = 0; - goto restart; - default: - return (ret); + goto restart_page; } + return (ret); } /* Track how deep the tree gets. */ @@ -517,7 +554,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) __cursor_pos_clear(cbt); -restart: +restart_root: /* Walk the internal pages of the tree. */ current = &btree->root; for (;;) { @@ -544,7 +581,7 @@ restart: */ if (ret == WT_RESTART && (ret = __wt_page_release(session, current, 0)) == 0) - goto restart; + goto restart_root; return (ret); } diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index aa14e9aadde..8d16f94c092 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -734,7 +734,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) WT_ERR(__wt_cond_wait(session, - cp->cache_pool_cond, 1000000)); + cp->cache_pool_cond, WT_MILLION)); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 8f039e61654..b47e2550b23 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -31,7 +31,7 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) * Checkpoints based on log size also require logging be enabled. 
*/ WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval)); - conn->ckpt_usecs = (uint64_t)cval.val * 1000000; + conn->ckpt_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval)); conn->ckpt_logsize = (wt_off_t)cval.val; diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 007d4273e72..1d44d816467 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -538,8 +538,8 @@ restart: while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; - WT_ASSERT(session, slot->slot_state != 0 || - slot->slot_release_lsn.file >= log->write_lsn.file); + WT_ASSERT(session, slot->slot_state != 0 || + slot->slot_release_lsn.file >= log->write_lsn.file); if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index f08c2b7996c..31438e10606 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -83,7 +83,7 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_RET(__wt_config_gets(session, cfg, "statistics_log.wait", &cval)); /* Only start the server if wait time is non-zero */ *runp = cval.val != 0; - conn->stat_usecs = (uint64_t)cval.val * 1000000; + conn->stat_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 4913a06764a..407b0b7f386 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -260,8 +260,8 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; const char *mainkey_str, *p; - int cmp, mainkey_len, skip; - size_t i, size; + int cmp, skip; + size_t i, mainkey_len, size; void *allocbuf; c = NULL; @@ -281,7 +281,7 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, p != NULL && i < 
maintable->nkey_columns; i++) p = strchr(p + 1, ','); WT_ASSERT(session, p != 0); - mainkey_len = p - mainkey_str; + mainkey_len = WT_PTRDIFF(p, mainkey_str); size = strlen(entry->index->name) + mainkey_len + 3; WT_ERR(__wt_calloc(session, size, 1, &uri)); snprintf(uri, size, "%s(%.*s)", entry->index->name, @@ -894,33 +894,32 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; WT_CURSOR_JOIN_ENDPOINT *end, *newend; - bool range_eq; - int nonbloom; - size_t i; - ssize_t ins; + bool hasins, needbloom, range_eq; + u_int i, ins, nonbloom; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; char *main_uri; size_t namesize, newsize; entry = NULL; - ins = -1; + hasins = needbloom = false; main_uri = NULL; - nonbloom = -1; namesize = strlen(cjoin->table->name); for (i = 0; i < cjoin->entries_next; i++) { if (cjoin->entries[i].index == index) { entry = &cjoin->entries[i]; break; } - if (nonbloom == -1 && i > 0 && - !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) + if (!needbloom && i > 0 && + !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) { + needbloom = true; nonbloom = i; + } } if (entry == NULL) { WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated, cjoin->entries_next + 1, &cjoin->entries)); - if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && nonbloom != -1) { + if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) { /* * Reorder the list so that after the first entry, * the Bloom filtered entries come next, followed by @@ -1001,11 +1000,13 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, * Sort "gt"/"ge" to the front, followed by any number * of "eq", and finally "lt"/"le". 
*/ - if (ins == -1 && + if (!hasins && ((range & WT_CURJOIN_END_GT) != 0 || (range == WT_CURJOIN_END_EQ && - !F_ISSET(end, WT_CURJOIN_END_GT)))) + !F_ISSET(end, WT_CURJOIN_END_GT)))) { ins = i; + hasins = true; + } } /* All checks completed, merge any new configuration now */ entry->count = count; @@ -1016,7 +1017,7 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, } WT_ERR(__wt_realloc_def(session, &entry->ends_allocated, entry->ends_next + 1, &entry->ends)); - if (ins == -1) + if (!hasins) ins = entry->ends_next; newend = &entry->ends[ins]; memmove(newend + 1, newend, diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 3b227d00198..c40e764e2f6 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -11,6 +11,13 @@ was relying on this behavior, a connection will be opened with different settings after upgrading, which could lead to errors or unexpected behavior. </dd> +<dt>Statistic change</dt> +<dd> +The statistic "pages split during eviction" was replaced. It has been +replaced by a pair of statistics "internal pages split during eviction" and +"leaf pages split during eviction". +</dd> + <dt>Change to WT_CURSOR::insert</dt> <dd> The WT_CURSOR::insert method in this release has slightly different semantics diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 18335d6fb5e..fa6c4f4313f 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -183,10 +183,10 @@ __evict_server(void *arg) session, &conn->dhandle_lock)) == EBUSY && !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { - if (spins < 1000) + if (spins < WT_THOUSAND) __wt_yield(); else - __wt_sleep(0, 1000); + __wt_sleep(0, WT_THOUSAND); } /* * If we gave up acquiring the lock, that indicates a @@ -210,7 +210,7 @@ __evict_server(void *arg) else { /* After being stuck for 5 minutes, give up. 
*/ WT_ERR(__wt_epoch(session, &now)); - if (WT_TIMEDIFF(now, stuck_ts) / WT_BILLION > 300) { + if (WT_TIMEDIFF_SEC(now, stuck_ts) > 300) { __wt_errx(session, "Cache stuck for too long, giving up"); (void)__wt_cache_dump(session, NULL); @@ -601,7 +601,7 @@ __evict_pass(WT_SESSION_IMPL *session) * that can free space in cache, such as LSM discarding * handles. */ - __wt_sleep(0, 1000 * (uint64_t)loop); + __wt_sleep(0, WT_THOUSAND * (uint64_t)loop); if (loop == 100) { /* * Mark the cache as stuck if we need space @@ -992,10 +992,10 @@ retry: while (slot < max_entries && ret == 0) { session, &conn->dhandle_lock)) == EBUSY && !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); spins++) { - if (spins < 1000) + if (spins < WT_THOUSAND) __wt_yield(); else - __wt_sleep(0, 1000); + __wt_sleep(0, WT_THOUSAND); } if (ret != 0) break; diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index e49098e90db..94c969fa5bb 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -179,9 +179,17 @@ __evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) * something is busy, be sure that the page still ends up * marked deleted. */ - if (ndeleted > pindex->entries / 10 && pindex->entries > 1 && - (ret = __wt_split_reverse(session, ref)) != EBUSY) - return (ret); + if (ndeleted > pindex->entries / 10 && pindex->entries > 1) { + if ((ret = __wt_split_reverse(session, ref)) == 0) + return (0); + WT_RET_BUSY_OK(ret); + + /* + * The child must be locked after a failed reverse + * split. 
+ */ + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + } } WT_PUBLISH(ref->state, WT_REF_DELETED); diff --git a/src/include/btmem.h b/src/include/btmem.h index 02819237c13..ae29dc68003 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -437,24 +437,10 @@ struct __wt_page { uint32_t deleted_entries; WT_REF **index; } * volatile __index; /* Collated children */ - - /* - * When splitting to deepen the tree, track the number - * of entries in the newly created parent, and how many - * subsequent splits follow the initial set of entries. - * If future splits into the page are generally after - * the initial set of items, perform future deepening - * splits in this page to optimize for an append-style - * workload. - */ - uint32_t deepen_split_append; - uint32_t deepen_split_last; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref -#define pg_intl_deepen_split_append u.intl.deepen_split_append -#define pg_intl_deepen_split_last u.intl.deepen_split_last /* * Macros to copy/set the index because the name is obscured to ensure @@ -581,7 +567,8 @@ struct __wt_page { #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_BLOCK 0x40 /* Split blocking eviction and splits */ +#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ diff --git a/src/include/btree.i b/src/include/btree.i index 23e212eb772..a92d52e784a 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1101,16 +1101,17 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, return (false); /* - * If the tree was deepened, there's a requirement that newly created - * internal pages not be 
evicted until all threads are known to have - * exited the original page index array, because evicting an internal - * page discards its WT_REF array, and a thread traversing the original - * page index array might see a freed WT_REF. During the split we set - * a transaction value, once that's globally visible, we know we can - * evict the created page. + * If a split created new internal pages, those newly created internal + * pages cannot be evicted until all threads are known to have exited + * the original parent page's index, because evicting an internal page + * discards its WT_REF array, and a thread traversing the original + * parent page index might see a freed WT_REF. During the split we set + * a transaction value, we can evict the created page as soon as that + * transaction value is globally visible. */ if (check_splits && WT_PAGE_IS_INTERNAL(page) && - !__wt_txn_visible_all(session, mod->mod_split_txn)) + (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK) || + !__wt_txn_visible_all(session, mod->mod_split_txn))) return (false); /* @@ -1374,3 +1375,34 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) return (child->memory_footprint > maxsize); } + +/* + * __wt_split_intl_race -- + * Return if we raced with an internal page split when descending the tree. + */ +static inline bool +__wt_split_intl_race( + WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex) +{ + WT_PAGE_INDEX *pindex; + + /* + * A place to hang this comment... + * + * There's a page-split race when we walk the tree: if we're splitting + * an internal page into its parent, we update the parent's page index + * and then update the page being split, and it's not an atomic update. + * A thread could read the parent page's original page index, and then + * read the page's replacement index. 
Because internal page splits work + * by replacing the original page with the initial part of the original + * page, the result of this race is we will have a key that's past the + * end of the current page, and the parent's page index will have moved. + * + * It's also possible a thread could read the parent page's replacement + * page index, and then read the page's original index. Because internal + * splits work by truncating the original page, the original page's old + * content is compatible, this isn't a problem and we ignore this race. + */ + WT_INTL_INDEX_GET(session, parent, pindex); + return (pindex != saved_pindex); +} diff --git a/src/include/cursor.h b/src/include/cursor.h index 98f520b9fd8..23d3f3745db 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -315,7 +315,7 @@ struct __wt_cursor_join { WT_CURSOR_JOIN_ITER *iter; WT_CURSOR_JOIN_ENTRY *entries; size_t entries_allocated; - size_t entries_next; + u_int entries_next; uint8_t recno_buf[10]; /* holds packed recno */ #define WT_CURJOIN_ERROR 0x01 /* Error in initialization */ @@ -361,7 +361,7 @@ struct __wt_cursor_metadata { struct __wt_join_stats_group { const char *desc_prefix; /* Prefix appears before description */ WT_CURSOR_JOIN *join_cursor; - size_t join_cursor_entry; /* Position in entries */ + ssize_t join_cursor_entry; /* Position in entries */ WT_JOIN_STATS join_stats; }; diff --git a/src/include/extern.h b/src/include/extern.h index 6516d1ff624..c999ee2752f 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -157,9 +157,9 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int 
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); -extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/include/misc.h b/src/include/misc.h index eca77214b47..e542baec642 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -13,6 +13,7 @@ #define WT_UNUSED(var) (void)(var) /* Basic constants. */ +#define WT_THOUSAND (1000) #define WT_MILLION (1000000) #define WT_BILLION (1000000000) diff --git a/src/include/mutex.i b/src/include/mutex.i index 843c4ad9350..7eb042dd79f 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -18,7 +18,7 @@ /* Default to spinning 1000 times before yielding. */ #ifndef WT_SPIN_COUNT -#define WT_SPIN_COUNT 1000 +#define WT_SPIN_COUNT WT_THOUSAND #endif /* @@ -300,7 +300,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) * situation happens if there are more threads than cores in the * system and we're thrashing on shared resources. 
*/ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); diff --git a/src/include/os.h b/src/include/os.h index 4ba588111b8..d135fd9eb1f 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -65,9 +65,16 @@ typedef enum { } \ } while (0) -#define WT_TIMEDIFF(end, begin) \ - (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \ +#define WT_TIMEDIFF_NS(end, begin) \ + (WT_BILLION * (uint64_t)((end).tv_sec - (begin).tv_sec) + \ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec) +#define WT_TIMEDIFF_US(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_THOUSAND) +#define WT_TIMEDIFF_MS(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_MILLION) +#define WT_TIMEDIFF_SEC(end, begin) \ + (WT_TIMEDIFF_NS((end), (begin)) / WT_BILLION) + #define WT_TIMECMP(t1, t2) \ ((t1).tv_sec < (t2).tv_sec ? -1 : \ (t1).tv_sec == (t2.tv_sec) ? \ diff --git a/src/include/stat.h b/src/include/stat.h index 32cee27f832..a77fe3a615f 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -272,7 +272,8 @@ struct __wt_connection_stats { int64_t cache_eviction_server_evicting; int64_t cache_eviction_server_not_evicting; int64_t cache_eviction_slow; - int64_t cache_eviction_split; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; int64_t cache_inmem_split; @@ -432,7 +433,8 @@ struct __wt_dsrc_stats { int64_t cache_eviction_fail; int64_t cache_eviction_hazard; int64_t cache_eviction_internal; - int64_t cache_eviction_split; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_inmem_split; int64_t cache_inmem_splittable; int64_t cache_overflow_value; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 4e773ba2234..7e7fc95cdce 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -3769,224 +3769,226 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION 
*connection); #define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043 /*! cache: eviction server unable to reach eviction goal */ #define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044 -/*! cache: pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT 1045 +/*! cache: internal pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1046 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1052 +#define WT_STAT_CONN_CACHE_OVERHEAD 1053 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1055 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1055 +#define WT_STAT_CONN_CACHE_READ 1056 /*! 
cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1057 +#define WT_STAT_CONN_CACHE_WRITE 1058 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1060 +#define WT_STAT_CONN_COND_WAIT 1061 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1061 +#define WT_STAT_CONN_CURSOR_CREATE 1062 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1062 +#define WT_STAT_CONN_CURSOR_INSERT 1063 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1063 +#define WT_STAT_CONN_CURSOR_NEXT 1064 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1064 +#define WT_STAT_CONN_CURSOR_PREV 1065 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1065 +#define WT_STAT_CONN_CURSOR_REMOVE 1066 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1066 +#define WT_STAT_CONN_CURSOR_RESET 1067 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1067 +#define WT_STAT_CONN_CURSOR_RESTART 1068 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1068 +#define WT_STAT_CONN_CURSOR_SEARCH 1069 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1070 +#define WT_STAT_CONN_CURSOR_UPDATE 1071 /*! 
data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1072 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1072 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1073 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1074 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1075 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1075 +#define WT_STAT_CONN_DH_SWEEP_REF 1076 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1077 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1077 +#define WT_STAT_CONN_DH_SWEEP_TOD 1078 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1078 +#define WT_STAT_CONN_DH_SWEEPS 1079 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1079 +#define WT_STAT_CONN_FILE_OPEN 1080 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1081 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1082 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1083 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1084 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1085 /*! 
log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1086 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1087 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1088 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1089 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1089 +#define WT_STAT_CONN_LOG_FLUSH 1090 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1090 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1091 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1091 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1092 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1092 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1093 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1093 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1094 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1094 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1095 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1095 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1096 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1096 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1097 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1097 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1098 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1098 +#define WT_STAT_CONN_LOG_SCANS 1099 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1099 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1100 /*! 
log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1100 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1101 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1101 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1102 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1102 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1103 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1103 +#define WT_STAT_CONN_LOG_SLOT_RACES 1104 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1104 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1105 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1106 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1106 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1107 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1107 +#define WT_STAT_CONN_LOG_SYNC 1108 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1108 +#define WT_STAT_CONN_LOG_SYNC_DIR 1109 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1109 +#define WT_STAT_CONN_LOG_WRITE_LSN 1110 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1110 +#define WT_STAT_CONN_LOG_WRITES 1111 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1111 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1112 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1112 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1113 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1113 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1114 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1114 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1115 /*! 
LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1115 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1116 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1116 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1117 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1117 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1118 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1118 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1119 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1119 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1120 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1120 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1121 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1121 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1122 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1122 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1123 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1123 +#define WT_STAT_CONN_MEMORY_FREE 1124 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1124 +#define WT_STAT_CONN_MEMORY_GROW 1125 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1125 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1126 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1126 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1127 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1127 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1128 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1128 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1129 /*! 
thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1129 +#define WT_STAT_CONN_PAGE_SLEEP 1130 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1130 +#define WT_STAT_CONN_READ_IO 1131 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1131 +#define WT_STAT_CONN_REC_PAGES 1132 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1132 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1133 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1133 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1134 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1134 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1135 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1135 +#define WT_STAT_CONN_RWLOCK_READ 1136 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1136 +#define WT_STAT_CONN_RWLOCK_WRITE 1137 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1137 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1138 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1138 +#define WT_STAT_CONN_SESSION_OPEN 1139 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1139 +#define WT_STAT_CONN_TXN_BEGIN 1140 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1140 +#define WT_STAT_CONN_TXN_CHECKPOINT 1141 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1141 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1142 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1142 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1143 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1143 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1144 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1144 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1145 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1145 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1146 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1146 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1147 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1147 +#define WT_STAT_CONN_TXN_COMMIT 1148 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1148 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1149 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1149 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1150 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1150 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1151 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1151 +#define WT_STAT_CONN_TXN_ROLLBACK 1152 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1152 +#define WT_STAT_CONN_TXN_SYNC 1153 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1153 +#define WT_STAT_CONN_WRITE_IO 1154 /*! * @} @@ -4082,112 +4084,114 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 /*! cache: internal pages evicted */ #define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 -/*! cache: pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044 +/*! 
cache: internal pages split during eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045 /*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045 +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046 +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2048 +#define WT_STAT_DSRC_CACHE_READ 2049 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049 +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2051 +#define WT_STAT_DSRC_CACHE_WRITE 2052 /*! cache: page written requiring lookaside records */ -#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052 +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053 +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057 /*! 
compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2057 +#define WT_STAT_DSRC_COMPRESS_READ 2058 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2058 +#define WT_STAT_DSRC_COMPRESS_WRITE 2059 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2061 +#define WT_STAT_DSRC_CURSOR_CREATE 2062 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2062 +#define WT_STAT_DSRC_CURSOR_INSERT 2063 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2065 +#define WT_STAT_DSRC_CURSOR_NEXT 2066 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2066 +#define WT_STAT_DSRC_CURSOR_PREV 2067 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2067 +#define WT_STAT_DSRC_CURSOR_REMOVE 2068 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2069 +#define WT_STAT_DSRC_CURSOR_RESET 2070 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2070 +#define WT_STAT_DSRC_CURSOR_RESTART 2071 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2071 +#define WT_STAT_DSRC_CURSOR_SEARCH 2072 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073 /*! 
cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2073 +#define WT_STAT_DSRC_CURSOR_UPDATE 2074 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2075 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2076 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2077 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2078 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2079 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2080 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2080 +#define WT_STAT_DSRC_REC_DICTIONARY 2081 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2082 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2083 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2084 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2085 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2086 /*! 
reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2087 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2087 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2088 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2088 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2089 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2089 +#define WT_STAT_DSRC_REC_PAGES 2090 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2091 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2092 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2093 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2093 +#define WT_STAT_DSRC_SESSION_COMPACT 2094 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2095 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2096 /*! 
* @} diff --git a/src/log/log.c b/src/log/log.c index 44dc7dc30a7..3106094e7e3 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -1313,7 +1313,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); - if (++yield_count < 1000) + if (++yield_count < WT_THOUSAND) __wt_yield(); else ret = __wt_cond_wait(session, log->log_write_cond, 200); diff --git a/src/log/log_slot.c b/src/log/log_slot.c index b3790412536..255551f99a4 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -380,7 +380,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, * There should almost always be a slot open. */ #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % 1000) == 0; + unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; #endif for (;;) { WT_BARRIER(); diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 5742427736a..953698476ef 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -81,7 +81,7 @@ __wt_clsm_await_switch(WT_CURSOR_LSM *clsm) lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen; ++waited) { - if (waited % 1000 == 0) + if (waited % WT_THOUSAND == 0) WT_RET(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); __wt_sleep(0, 10); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 1c5124c32af..d8cf36f2cc1 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -388,8 +388,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) continue; WT_ERR(__wt_epoch(session, &now)); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 
0 : - WT_TIMEDIFF( - now, lsm_tree->work_push_ts) / WT_MILLION; + WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts); fillms = 3 * lsm_tree->chunk_fill_ms; if (fillms == 0) fillms = 10000; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index dd1419fe67d..1a2608803e4 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -94,7 +94,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_RET(__wt_epoch(session, &now)); msec_since_last_merge = - WT_TIMEDIFF(now, lsm_tree->merge_aggressive_ts) / WT_MILLION; + WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts); /* * If there is no estimate for how long it's taking to fill chunks @@ -457,7 +457,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) cfg[2] = NULL; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); -#define LSM_MERGE_CHECK_INTERVAL 1000 +#define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index a43d63afa74..0c3642e70e8 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -111,7 +111,7 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * other schema level operations will return EBUSY, even though * we're dropping the schema lock here. 
*/ - if (i % 1000 == 0) { + if (i % WT_THOUSAND == 0) { WT_WITHOUT_LOCKS(session, ret = __wt_lsm_manager_clear_tree(session, lsm_tree)); WT_RET(ret); @@ -752,7 +752,7 @@ __wt_lsm_tree_throttle( WT_ASSERT(session, WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0); timediff = - WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts); + WT_TIMEDIFF_NS(last_chunk->create_ts, ondisk->create_ts); lsm_tree->ckpt_throttle = (in_memory - 2) * timediff / (20 * record_count); @@ -788,8 +788,8 @@ __wt_lsm_tree_throttle( } /* Put an upper bound of 1s on both throttle calculations. */ - lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle); - lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle); + lsm_tree->ckpt_throttle = WT_MIN(WT_MILLION, lsm_tree->ckpt_throttle); + lsm_tree->merge_throttle = WT_MIN(WT_MILLION, lsm_tree->merge_throttle); /* * Update our estimate of how long each in-memory chunk stays active. @@ -803,15 +803,16 @@ __wt_lsm_tree_throttle( WT_ASSERT(session, prev_chunk->generation == 0); WT_ASSERT(session, WT_TIMECMP( last_chunk->create_ts, prev_chunk->create_ts) >= 0); - timediff = - WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts); + timediff = WT_TIMEDIFF_NS( + last_chunk->create_ts, prev_chunk->create_ts); WT_ASSERT(session, WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0); - oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts); + oldtime = WT_TIMEDIFF_NS( + prev_chunk->create_ts, ondisk->create_ts); if (timediff < 10 * oldtime) lsm_tree->chunk_fill_ms = (3 * lsm_tree->chunk_fill_ms + - timediff / 1000000) / 4; + timediff / WT_MILLION) / 4; } } diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index fac2c06957d..d5fc86b648b 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -76,9 +76,9 @@ __wt_cond_wait_signal( if (usecs > 0) { WT_ERR(__wt_epoch(session, &ts)); ts.tv_sec += (time_t) - (((uint64_t)ts.tv_nsec + 1000 * usecs) / WT_BILLION); + 
(((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION); ts.tv_nsec = (long) - (((uint64_t)ts.tv_nsec + 1000 * usecs) % WT_BILLION); + (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) % WT_BILLION); ret = pthread_cond_timedwait(&cond->cond, &cond->mtx, &ts); } else ret = pthread_cond_wait(&cond->cond, &cond->mtx); diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c index d47ab197643..46f134feabb 100644 --- a/src/os_posix/os_mtx_rw.c +++ b/src/os_posix/os_mtx_rw.c @@ -201,7 +201,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Don't sleep long when waiting on a read lock, hopefully we're * waiting on another read thread to increment the reader count. */ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); @@ -300,7 +300,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * situation happens if there are more threads than cores in the * system and we're thrashing on shared resources. */ - if (++pause_cnt < 1000) + if (++pause_cnt < WT_THOUSAND) WT_PAUSE(); else __wt_sleep(0, 10); diff --git a/src/os_posix/os_sleep.c b/src/os_posix/os_sleep.c index f888e51bf7f..4e90edabc53 100644 --- a/src/os_posix/os_sleep.c +++ b/src/os_posix/os_sleep.c @@ -17,8 +17,8 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds) { struct timeval t; - t.tv_sec = (time_t)(seconds + micro_seconds / 1000000); - t.tv_usec = (suseconds_t)(micro_seconds % 1000000); + t.tv_sec = (time_t)(seconds + micro_seconds / WT_MILLION); + t.tv_usec = (suseconds_t)(micro_seconds % WT_MILLION); (void)select(0, NULL, NULL, NULL, &t); } diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c index 6bddf4a18dd..c3052df62e7 100644 --- a/src/os_posix/os_time.c +++ b/src/os_posix/os_time.c @@ -28,7 +28,7 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); if (ret == 0) { tsp->tv_sec = v.tv_sec; - tsp->tv_nsec = v.tv_usec * 1000; + tsp->tv_nsec = v.tv_usec * WT_THOUSAND; 
return (0); } WT_RET_MSG(session, ret, "gettimeofday"); diff --git a/src/os_win/os_sleep.c b/src/os_win/os_sleep.c index 484cf218f26..33e04c1d8a9 100644 --- a/src/os_win/os_sleep.c +++ b/src/os_win/os_sleep.c @@ -19,7 +19,7 @@ __wt_sleep(uint64_t seconds, uint64_t micro_seconds) * If the caller wants a small pause, set to our * smallest granularity. */ - if (seconds == 0 && micro_seconds < 1000) - micro_seconds = 1000; - Sleep(seconds * 1000 + micro_seconds / 1000); + if (seconds == 0 && micro_seconds < WT_THOUSAND) + micro_seconds = WT_THOUSAND; + Sleep(seconds * WT_THOUSAND + micro_seconds / WT_THOUSAND); } diff --git a/src/packing/pack_impl.c b/src/packing/pack_impl.c index 99fa0d54869..447c887dc6f 100644 --- a/src/packing/pack_impl.c +++ b/src/packing/pack_impl.c @@ -132,7 +132,7 @@ __wt_struct_unpack_size(WT_SESSION_IMPL *session, if (ret != WT_NOTFOUND) return (ret); - *resultp = (p - (uint8_t *)buffer); + *resultp = WT_PTRDIFF(p, buffer); return (0); } @@ -190,7 +190,7 @@ __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, WT_ERR(ENOTSUP); } if (pout != NULL) { - memcpy(pout, before, p - before); + memcpy(pout, before, WT_PTRDIFF(p, before)); pout += p - before; } else if (start == NULL) start = before; @@ -202,10 +202,10 @@ __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, if (pout != NULL) { outbuf->data = *reallocp; - outbuf->size = (pout - (uint8_t *)*reallocp); + outbuf->size = WT_PTRDIFF(pout, *reallocp); } else { outbuf->data = start; - outbuf->size = (p - (uint8_t *)start); + outbuf->size = WT_PTRDIFF(p, start); } err: return (ret); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 4479f4a8515..fe60cc16063 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -960,7 +960,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) * than 10,000 boundary structure elements, discard the boundary array * entirely and start over next time. 
*/ - if (destroy || r->bnd_entries > 10 * 1000) { + if (destroy || r->bnd_entries > 10 * WT_THOUSAND) { for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); diff --git a/src/session/session_api.c b/src/session/session_api.c index e3a28732494..25de2f2983a 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -711,7 +711,7 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor, } WT_ERR(__wt_config_gets(session, cfg, "count", &cval)); if (cval.len != 0) - count = cval.val; + count = (uint64_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "strategy", &cval)); if (cval.len != 0) { @@ -1150,7 +1150,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); WT_ERR(__wt_epoch(session, &now)); - waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION; + waited_ms = WT_TIMEDIFF_MS(now, start); if (forever || waited_ms < timeout_ms) /* * Note, we will wait an increasing amount of time diff --git a/src/session/session_compact.c b/src/session/session_compact.c index bd503cd7826..456fcd3ce03 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -133,8 +133,7 @@ __session_compact_check_timeout( return (0); WT_RET(__wt_epoch(session, &end)); - if (session->compact->max_time < - WT_TIMEDIFF(end, begin) / WT_BILLION) + if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin)) WT_RET(ETIMEDOUT); return (0); } diff --git a/src/support/err.c b/src/support/err.c index c4bf4e8946a..de518cbf08b 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -199,7 +199,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, remain = WT_PTRDIFF(end, p); wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", - (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / 1000, tid); + (uintmax_t)ts.tv_sec, + 
(uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); p = wlen >= remain ? end : p + wlen; prefix_cnt = 1; } diff --git a/src/support/stat.c b/src/support/stat.c index 83c0166b46e..10db4043235 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -47,7 +47,8 @@ static const char * const __stats_dsrc_desc[] = { "cache: data source pages selected for eviction unable to be evicted", "cache: hazard pointer blocked page eviction", "cache: internal pages evicted", - "cache: pages split during eviction", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: in-memory page splits", "cache: in-memory page passed criteria to be split", "cache: overflow values cached in memory", @@ -165,6 +166,8 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_inmem_splittable = 0; stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_eviction_split_internal = 0; + stats->cache_eviction_split_leaf = 0; stats->cache_eviction_dirty = 0; stats->cache_read_overflow = 0; stats->cache_overflow_value = 0; @@ -172,7 +175,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_write_lookaside = 0; stats->cache_read = 0; stats->cache_read_lookaside = 0; - stats->cache_eviction_split = 0; stats->cache_write = 0; stats->cache_write_restore = 0; stats->cache_eviction_clean = 0; @@ -282,6 +284,9 @@ __wt_stat_dsrc_aggregate_single( to->cache_inmem_splittable += from->cache_inmem_splittable; to->cache_inmem_split += from->cache_inmem_split; to->cache_eviction_internal += from->cache_eviction_internal; + to->cache_eviction_split_internal += + from->cache_eviction_split_internal; + to->cache_eviction_split_leaf += from->cache_eviction_split_leaf; to->cache_eviction_dirty += from->cache_eviction_dirty; to->cache_read_overflow += from->cache_read_overflow; to->cache_overflow_value += from->cache_overflow_value; @@ -289,7 +294,6 @@ __wt_stat_dsrc_aggregate_single( to->cache_write_lookaside += 
from->cache_write_lookaside; to->cache_read += from->cache_read; to->cache_read_lookaside += from->cache_read_lookaside; - to->cache_eviction_split += from->cache_eviction_split; to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; to->cache_eviction_clean += from->cache_eviction_clean; @@ -409,6 +413,10 @@ __wt_stat_dsrc_aggregate( to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_eviction_split_internal += + WT_STAT_READ(from, cache_eviction_split_internal); + to->cache_eviction_split_leaf += + WT_STAT_READ(from, cache_eviction_split_leaf); to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow); to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value); @@ -418,7 +426,6 @@ __wt_stat_dsrc_aggregate( WT_STAT_READ(from, cache_write_lookaside); to->cache_read += WT_STAT_READ(from, cache_read); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); - to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); @@ -531,7 +538,8 @@ static const char * const __stats_connection_desc[] = { "cache: eviction server evicting pages", "cache: eviction server populating queue, but not evicting pages", "cache: eviction server unable to reach eviction goal", - "cache: pages split during eviction", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", "cache: in-memory page splits", @@ -705,6 +713,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_inmem_splittable = 0; 
stats->cache_inmem_split = 0; stats->cache_eviction_internal = 0; + stats->cache_eviction_split_internal = 0; + stats->cache_eviction_split_leaf = 0; stats->cache_lookaside_insert = 0; stats->cache_lookaside_remove = 0; /* not clearing cache_bytes_max */ @@ -719,7 +729,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_read = 0; stats->cache_read_lookaside = 0; stats->cache_eviction_fail = 0; - stats->cache_eviction_split = 0; stats->cache_eviction_walk = 0; stats->cache_write = 0; stats->cache_write_restore = 0; @@ -884,6 +893,10 @@ __wt_stat_connection_aggregate( to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); to->cache_eviction_internal += WT_STAT_READ(from, cache_eviction_internal); + to->cache_eviction_split_internal += + WT_STAT_READ(from, cache_eviction_split_internal); + to->cache_eviction_split_leaf += + WT_STAT_READ(from, cache_eviction_split_leaf); to->cache_lookaside_insert += WT_STAT_READ(from, cache_lookaside_insert); to->cache_lookaside_remove += @@ -904,7 +917,6 @@ __wt_stat_connection_aggregate( to->cache_read += WT_STAT_READ(from, cache_read); to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); - to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 066abc9ed0f..bc1537ca878 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -297,7 +297,7 @@ __checkpoint_stats( /* * Get time diff in milliseconds. */ - msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION; + msec = WT_TIMEDIFF_MS(*stop, *start); if (msec > conn->ckpt_time_max) conn->ckpt_time_max = msec; @@ -327,7 +327,7 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, /* * Get time diff in milliseconds. 
*/ - msec = WT_TIMEDIFF(stop, *start) / WT_MILLION; + msec = WT_TIMEDIFF_MS(stop, *start); WT_RET(__wt_verbose(session, WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64 ": Full database checkpoint %s", |