diff options
Diffstat (limited to 'src')
193 files changed, 6960 insertions, 4499 deletions
diff --git a/src/async/async_api.c b/src/async/async_api.c index 54bcb7cd26c..b9cc995f5a5 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -240,8 +240,7 @@ __async_start(WT_SESSION_IMPL *session) async = conn->async; TAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); - WT_RET(__wt_cond_alloc( - session, "async flush", false, &async->flush_cond)); + WT_RET(__wt_cond_alloc(session, "async flush", &async->flush_cond)); WT_RET(__wt_async_op_init(session)); /* @@ -339,17 +338,15 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) * 2. If async is off, and the user wants it on, start it. * 3. If not a toggle and async is off, we're done. */ - if (conn->async_cfg && !run) { - /* Case 1 */ + if (conn->async_cfg && !run) { /* Case 1 */ WT_TRET(__wt_async_flush(session)); ret = __wt_async_destroy(session); conn->async_cfg = false; return (ret); - } else if (!conn->async_cfg && run) - /* Case 2 */ + } + if (!conn->async_cfg && run) /* Case 2 */ return (__async_start(session)); - else if (!conn->async_cfg) - /* Case 3 */ + if (!conn->async_cfg) /* Case 3 */ return (0); /* @@ -541,7 +538,7 @@ retry: async->flush_op.state = WT_ASYNCOP_READY; WT_RET(__wt_async_op_enqueue(session, &async->flush_op)); while (async->flush_state != WT_ASYNC_FLUSH_COMPLETE) - __wt_cond_wait(session, async->flush_cond, 100000); + __wt_cond_wait(session, async->flush_cond, 100000, NULL); /* * Flush is done. Clear the flags. 
*/ diff --git a/src/async/async_worker.c b/src/async/async_worker.c index b1bc3902f7c..11f59ed14f1 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -107,7 +107,7 @@ __async_flush_wait(WT_SESSION_IMPL *session, WT_ASYNC *async, uint64_t my_gen) { while (async->flush_state == WT_ASYNC_FLUSHING && async->flush_gen == my_gen) - __wt_cond_wait(session, async->flush_cond, 10000); + __wt_cond_wait(session, async->flush_cond, 10000, NULL); } /* diff --git a/src/block/block_addr.c b/src/block/block_addr.c index 580316bdfc6..a67efca62a3 100644 --- a/src/block/block_addr.c +++ b/src/block/block_addr.c @@ -226,7 +226,7 @@ __wt_block_ckpt_to_buffer(WT_SESSION_IMPL *session, ci->discard.offset, ci->discard.size, ci->discard.checksum)); a = (uint64_t)ci->file_size; WT_RET(__wt_vpack_uint(pp, 0, a)); - a = (uint64_t)ci->ckpt_size; + a = ci->ckpt_size; WT_RET(__wt_vpack_uint(pp, 0, a)); return (0); diff --git a/src/block/block_ext.c b/src/block/block_ext.c index 26acc8c560f..da7a06d873d 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -634,11 +634,11 @@ __wt_block_off_free( */ if ((ret = __wt_block_off_remove_overlap( session, block, &block->live.alloc, offset, size)) == 0) - ret = __block_merge(session, block, - &block->live.avail, offset, (wt_off_t)size); + ret = __block_merge( + session, block, &block->live.avail, offset, size); else if (ret == WT_NOTFOUND) - ret = __block_merge(session, block, - &block->live.discard, offset, (wt_off_t)size); + ret = __block_merge( + session, block, &block->live.discard, offset, size); return (ret); } @@ -1247,7 +1247,8 @@ __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_DECL_RET; WT_EXT *ext; WT_PAGE_HEADER *dsk; - size_t entries, size; + uint32_t entries; + size_t size; uint8_t *p; WT_RET(__block_extlist_dump(session, block, el, "write")); @@ -1377,8 +1378,8 @@ __wt_block_extlist_init(WT_SESSION_IMPL *session, size = (name == NULL ? 0 : strlen(name)) + strlen(".") + (extname == NULL ? 
0 : strlen(extname) + 1); WT_RET(__wt_calloc_def(session, size, &el->name)); - (void)snprintf(el->name, size, "%s.%s", - name == NULL ? "" : name, extname == NULL ? "" : extname); + WT_RET(__wt_snprintf(el->name, size, "%s.%s", + name == NULL ? "" : name, extname == NULL ? "" : extname)); el->offset = WT_BLOCK_INVALID_OFFSET; el->track_size = track_size; diff --git a/src/block/block_read.c b/src/block/block_read.c index 869a92b6ae1..8d4aec7df75 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -39,7 +39,7 @@ __wt_bm_preload( (uint8_t *)bm->map + offset, size, bm->mapped_cookie); if (!mapped && handle->fh_advise != NULL) ret = handle->fh_advise(handle, (WT_SESSION *)session, - (wt_off_t)offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED); + offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED); if (ret != EBUSY && ret != ENOTSUP) return (ret); diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c index 94824ad19f8..154765ed079 100644 --- a/src/block/block_vrfy.c +++ b/src/block/block_vrfy.c @@ -22,7 +22,7 @@ static int __verify_set_file_size(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); ((off) / (block)->allocsize - 1) #ifdef HAVE_VERBOSE #define WT_FRAG_TO_OFF(block, frag) \ - (((wt_off_t)(frag + 1)) * (block)->allocsize) + (((wt_off_t)((frag) + 1)) * (block)->allocsize) #endif /* diff --git a/src/block/block_write.c b/src/block/block_write.c index d08aba45920..ea7859d6a38 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -43,10 +43,10 @@ __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len) * more targeted solution at some point. 
*/ if (!conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __wt_ftruncate(session, block->fh, len); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); } /* diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c index be3230437d3..b8d75678835 100644 --- a/src/bloom/bloom.c +++ b/src/bloom/bloom.c @@ -37,8 +37,8 @@ __bloom_init(WT_SESSION_IMPL *session, len += strlen(config); WT_ERR(__wt_calloc_def(session, len, &bloom->config)); /* Add the standard config at the end, so it overrides user settings. */ - (void)snprintf(bloom->config, len, - "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG); + WT_ERR(__wt_snprintf(bloom->config, len, + "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG)); bloom->session = session; diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c index 4d3976f9647..21e575ffca9 100644 --- a/src/btree/bt_curnext.c +++ b/src/btree/bt_curnext.c @@ -338,7 +338,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } /* Check for the end of the page. 
*/ - if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1) + if (cbt->row_iteration_slot >= page->entries * 2 + 1) return (WT_NOTFOUND); ++cbt->row_iteration_slot; @@ -356,7 +356,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) @@ -579,20 +579,20 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) { + WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); - flags = WT_READ_SKIP_INTL; /* Tree walk flags. */ - if (truncating) - LF_SET(WT_READ_TRUNCATE); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); @@ -608,6 +608,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) * found. Then, move to the next page, until we reach the end of the * file. */ + flags = WT_READ_SKIP_INTL; /* tree walk flags */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? 
NULL : cbt->ref->page; @@ -676,6 +679,8 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 2dd443ffac1..bf4bdad6529 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -458,13 +458,13 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) WT_RET(__wt_row_leaf_keys(session, page)); - if (page->pg_row_entries == 0) + if (page->entries == 0) cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); else cbt->ins_head = - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); cbt->ins = WT_SKIP_LAST(cbt->ins_head); - cbt->row_iteration_slot = page->pg_row_entries * 2 + 1; + cbt->row_iteration_slot = page->entries * 2 + 1; cbt->rip_saved = NULL; goto new_insert; } @@ -515,7 +515,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->ins = NULL; cbt->slot = cbt->row_iteration_slot / 2 - 1; - rip = &page->pg_row_d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) { if (__wt_txn_visible_all(session, upd->txnid)) @@ -535,20 +535,20 @@ new_insert: if ((ins = cbt->ins) != NULL) { int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { + WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); - flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. 
*/ - if (truncating) - LF_SET(WT_READ_TRUNCATE); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); @@ -564,6 +564,9 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) * found. Then, move to the previous page, until we reach the start * of the file. */ + flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* tree walk flags */ + if (truncating) + LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; @@ -631,6 +634,8 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif + if (ret == 0) + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 650289f2cd8..944e276fc01 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -9,6 +9,84 @@ #include "wt_internal.h" /* + * When returning an error, we need to restore the cursor to a valid state, the + * upper-level cursor code is likely to retry. This structure and the associated + * functions are used save and restore the cursor state. + */ +typedef struct { + WT_ITEM key; + WT_ITEM value; + uint64_t recno; + uint32_t flags; +} WT_CURFILE_STATE; + +/* + * __cursor_state_save -- + * Save the cursor's external state. + */ +static inline void +__cursor_state_save(WT_CURSOR *cursor, WT_CURFILE_STATE *state) +{ + WT_ITEM_SET(state->key, cursor->key); + WT_ITEM_SET(state->value, cursor->value); + state->recno = cursor->recno; + state->flags = cursor->flags; +} + +/* + * __cursor_state_restore -- + * Restore the cursor's external state. 
+ */ +static inline void +__cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state) +{ + if (F_ISSET(state, WT_CURSTD_KEY_EXT)) + WT_ITEM_SET(cursor->key, state->key); + if (F_ISSET(state, WT_CURSTD_VALUE_EXT)) + WT_ITEM_SET(cursor->value, state->value); + cursor->recno = state->recno; + F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + F_SET(cursor, F_MASK(state, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT)); + +} + +/* + * __cursor_page_pinned -- + * Return if we have a page pinned and it's not been flagged for forced + * eviction (the forced eviction test is so we periodically release pages + * grown too large). + */ +static inline bool +__cursor_page_pinned(WT_CURSOR_BTREE *cbt) +{ + return (F_ISSET(cbt, WT_CBT_ACTIVE) && + cbt->ref->page->read_gen != WT_READGEN_OLDEST); +} + +/* + * __cursor_copy_int_key -- + * If we're pointing into the tree, save the key into local memory. + */ +static inline int +__cursor_copy_int_key(WT_CURSOR *cursor) +{ + /* + * We're about to discard the cursor's position and the cursor layer + * might retry the operation. We discard pinned pages on error, which + * will invalidate pinned keys. Clear WT_CURSTD_KEY_INT in all cases, + * the underlying page is gone whether we can allocate memory or not. + */ + if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + F_CLR(cursor, WT_CURSTD_KEY_INT); + if (!WT_DATA_IN_ITEM(&cursor->key)) + WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session, + &cursor->key, cursor->key.data, cursor->key.size)); + F_SET(cursor, WT_CURSTD_KEY_EXT); + } + return (0); +} + +/* * __cursor_size_chk -- * Return if an inserted item is too large. */ @@ -55,6 +133,34 @@ __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) } /* + * __cursor_disable_bulk -- + * Disable bulk loads into a tree. 
+ */ +static inline void +__cursor_disable_bulk(WT_SESSION_IMPL *session, WT_BTREE *btree) +{ + /* + * Once a tree (other than the LSM primary) is no longer empty, eviction + * should pay attention to it, and it's no longer possible to bulk-load + * into it. + */ + if (!btree->original) + return; + if (btree->lsm_primary) { + btree->original = 0; /* Make the next test faster. */ + return; + } + + /* + * We use a compare-and-swap here to avoid races among the first inserts + * into a tree. Eviction is disabled when an empty tree is opened, and + * it must only be enabled once. + */ + if (__wt_atomic_cas8(&btree->original, 1, 0)) + __wt_evict_file_exclusive_off(session); +} + +/* * __cursor_fix_implicit -- * Return if search went past the end of the tree. */ @@ -76,11 +182,11 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) } /* - * __cursor_valid -- + * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. */ -static inline bool -__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +bool +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; WT_CELL *cell; @@ -163,7 +269,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * column-store pages don't have slots, but map one-to-one to * keys, check for retrieval past the end of the page. */ - if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries) + if (cbt->recno >= cbt->ref->ref_recno + page->entries) return (false); /* @@ -173,9 +279,9 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) break; case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ - if (page->pg_var_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_var_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * Column-store updates are stored as "insert" objects. 
If @@ -191,16 +297,16 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * backing store; check the cell for a record already deleted * when read. */ - cip = &page->pg_var_d[cbt->slot]; + cip = &page->pg_var[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) return (false); break; case BTREE_ROW: /* The search function doesn't check for empty pages. */ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (false); - WT_ASSERT(session, cbt->slot < page->pg_row_entries); + WT_ASSERT(session, cbt->slot < page->entries); /* * See above: for row-store, no insert object can have the same @@ -285,13 +391,17 @@ __cursor_row_modify( int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) { + WT_CURSOR *cursor; WT_SESSION_IMPL *session; + cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_reset); WT_STAT_DATA_INCR(session, cursor_reset); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + return (__cursor_reset(cbt)); } @@ -303,6 +413,7 @@ int __wt_btcur_search(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -317,20 +428,28 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_INCR(session, cursor_search); WT_STAT_DATA_INCR(session, cursor_search); + __cursor_state_save(cursor, &state); + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key, then re-save the cursor state. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + /* * If we have a page pinned, search it; if we don't have a page pinned, * or the search of the pinned page doesn't find an exact match, search * from the root. */ valid = false; - if (F_ISSET(cbt, WT_CBT_ACTIVE) && - cbt->ref->page->read_gen != WT_READGEN_OLDEST) { + if (__cursor_page_pinned(cbt)) { __wt_txn_cursor_op(session); WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -338,7 +457,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __cursor_valid(cbt, &upd); + valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); } if (valid) @@ -352,6 +471,8 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } else ret = WT_NOTFOUND; @@ -360,8 +481,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(__wt_cursor_key_order_init(session, cbt)); #endif -err: if (ret != 0) +err: if (ret != 0) { WT_TRET(__cursor_reset(cbt)); + __cursor_state_restore(cursor, &state); + } return (ret); } @@ -373,6 +496,7 @@ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -389,6 +513,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_STAT_CONN_INCR(session, cursor_search_near); WT_STAT_DATA_INCR(session, cursor_search_near); + __cursor_state_save(cursor, &state); + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key, then re-save the cursor state. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact @@ -402,9 +535,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * existing record. 
*/ valid = false; - if (btree->type == BTREE_ROW && - F_ISSET(cbt, WT_CBT_ACTIVE) && - cbt->ref->page->read_gen != WT_READGEN_OLDEST) { + if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) { __wt_txn_cursor_op(session); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); @@ -418,16 +549,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ - if (cbt->slot != 0 && - cbt->slot != cbt->ref->page->pg_row_entries - 1) - valid = __cursor_valid(cbt, &upd); + if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) + valid = __wt_cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __cursor_valid(cbt, &upd); + valid = __wt_cursor_valid(cbt, &upd); } /* @@ -456,6 +586,8 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) exact = 1; else { @@ -463,22 +595,25 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - if (__cursor_valid(cbt, &upd)) { + if (__wt_cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) exact = -1; } +err: if (ret == 0 && exactp != NULL) + *exactp = exact; + #ifdef HAVE_DIAGNOSTIC if (ret == 0) - WT_ERR(__wt_cursor_key_order_init(session, cbt)); + WT_TRET(__wt_cursor_key_order_init(session, cbt)); #endif -err: if (ret != 0) + if (ret != 0) { WT_TRET(__cursor_reset(cbt)); - if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) - *exactp = exact; + __cursor_state_restore(cursor, &state); + } return (ret); } @@ -490,9 +625,11 @@ int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool append_key; btree = cbt->btree; cursor = &cbt->iface; @@ -503,30 +640,86 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); + __cursor_state_save(cursor, &state); + if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); + /* It's no longer possible to bulk-load into the tree. */ + __cursor_disable_bulk(session, btree); + + /* + * Insert a new record if WT_CURSTD_APPEND configured, (ignoring any + * application set record number). Although append can't be configured + * for a row-store, this code would break if it were, and that's owned + * by the upper cursor layer, be cautious. + */ + append_key = + F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW; + /* - * The tree is no longer empty: eviction should pay attention to it, - * and it's no longer possible to bulk-load into it. + * If inserting with overwrite configured, and positioned to an on-page + * key, the update doesn't require another search. 
The cursor won't be + * positioned on a page with an external key set, but be sure. Cursors + * configured for append aren't included, regardless of whether or not + * they meet all other criteria. */ - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - __wt_btree_evictable(session, true); + if (__cursor_page_pinned(cbt) && + F_ISSET_ALL(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_OVERWRITE) && + !append_key) { + WT_ERR(__wt_txn_autocommit_check(session)); + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). Correct to an exact match so we can + * update whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, false) : + __cursor_col_modify(session, cbt, false); + if (ret == 0) + goto done; + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; } -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Re-save the cursor state: we may retry but + * eventually fail. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: +retry: WT_ERR(__cursor_func_init(cbt, true)); + + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + /* + * If not overwriting, fail if the key exists, else insert the + * key/value pair. 
+ */ + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && + cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_DUPLICATE_KEY); + + ret = __cursor_row_modify(session, cbt, false); + } else { /* - * If WT_CURSTD_APPEND is set, insert a new record (ignoring - * the application's record number). The real record number - * is assigned by the serialized append operation. + * Optionally insert a new record (ignoring the application's + * record number). The real record number is allocated by the + * serialized append operation. */ - if (F_ISSET(cursor, WT_CURSTD_APPEND)) + if (append_key) cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); @@ -538,26 +731,14 @@ retry: WT_RET(__cursor_func_init(cbt, true)); * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || + ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); WT_ERR(__cursor_col_modify(session, cbt, false)); - if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = cbt->recno; - break; - case BTREE_ROW: - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * If not overwriting, fail if the key exists, else insert the - * key/value pair. - */ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __cursor_valid(cbt, NULL)) - WT_ERR(WT_DUPLICATE_KEY); - ret = __cursor_row_modify(session, cbt, false); - break; + if (append_key) + cbt->iface.recno = cbt->recno; } err: if (ret == WT_RESTART) { @@ -565,11 +746,17 @@ err: if (ret == WT_RESTART) { WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } - /* Insert doesn't maintain a position across calls, clear resources. */ - if (ret == 0) - WT_TRET(__curfile_leave(cbt)); + +done: /* Insert doesn't maintain a position across calls, clear resources. 
*/ + if (ret == 0) { + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (append_key) + F_SET(cursor, WT_CURSTD_KEY_INT); + } + WT_TRET(__cursor_reset(cbt)); if (ret != 0) - WT_TRET(__cursor_reset(cbt)); + __cursor_state_restore(cursor, &state); + return (ret); } @@ -605,16 +792,15 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt) } /* - * __wt_btcur_update_check -- + * __wt_btcur_insert_check -- * Check whether an update would conflict. * - * This can be used to replace WT_CURSOR::insert or WT_CURSOR::update, so - * they only check for conflicts without updating the tree. It is used to - * maintain snapshot isolation for transactions that span multiple chunks - * in an LSM tree. + * This can replace WT_CURSOR::insert, so it only checks for conflicts without + * updating the tree. It is used to maintain snapshot isolation for transactions + * that span multiple chunks in an LSM tree. */ int -__wt_btcur_update_check(WT_CURSOR_BTREE *cbt) +__wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; @@ -625,31 +811,35 @@ __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) btree = cbt->btree; session = (WT_SESSION_IMPL *)cursor->session; -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Unlike most of the btree cursor routines, + * we don't have to save/restore the cursor key state, none of the + * work done here changes the key state. + */ + WT_ERR(__cursor_copy_int_key(cursor)); - switch (btree->type) { - case BTREE_ROW: +retry: WT_ERR(__cursor_func_init(cbt, true)); + + if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * Just check for conflicts. - */ + /* Just check for conflicts. 
*/ ret = __curfile_update_check(cbt); - break; - case BTREE_COL_FIX: - case BTREE_COL_VAR: + } else WT_ERR(__wt_illegal_value(session, NULL)); - break; - } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } - WT_TRET(__curfile_leave(cbt)); - if (ret != 0) - WT_TRET(__cursor_reset(cbt)); + + /* Insert doesn't maintain a position across calls, clear resources. */ + if (ret == 0) + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + WT_TRET(__cursor_reset(cbt)); + return (ret); } @@ -661,9 +851,11 @@ int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool positioned; btree = cbt->btree; cursor = &cbt->iface; @@ -673,22 +865,80 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); -retry: WT_RET(__cursor_func_init(cbt, true)); + __cursor_state_save(cursor, &state); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: + /* + * WT_CURSOR.remove has a unique semantic, the cursor stays positioned + * if it starts positioned, otherwise clear the cursor on completion. + */ + positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); + + /* + * If remove positioned to an on-page key, the remove doesn't require + * another search. We don't care about the "overwrite" configuration + * because regardless of the overwrite setting, any existing record is + * removed, and the record must exist with a positioned cursor. The + * cursor won't be positioned on a page with an external key set, but + * be sure. + */ + if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + WT_ERR(__wt_txn_autocommit_check(session)); + + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). 
Correct to an exact match so we can + * remove whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, true) : + __cursor_col_modify(session, cbt, true); + if (ret == 0) + goto done; + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; + } + + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Re-save the cursor state: we may retry but + * eventually fail. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + +retry: WT_ERR(__cursor_func_init(cbt, true)); + + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, false)); + + /* Check whether an update would conflict. */ + WT_ERR(__curfile_update_check(cbt)); + + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_NOTFOUND); + + ret = __cursor_row_modify(session, cbt, true); + } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If we find a matching record, check whether an update would * conflict. Do this before checking if the update is visible - * in __cursor_valid, or we can miss conflict. + * in __wt_cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -704,19 +954,6 @@ retry: WT_RET(__cursor_func_init(cbt, true)); cbt->recno = cursor->recno; } else ret = __cursor_col_modify(session, cbt, true); - break; - case BTREE_ROW: - /* Remove the record if it exists. 
*/ - WT_ERR(__cursor_row_search(session, cbt, NULL, false)); - - /* Check whether an update would conflict. */ - WT_ERR(__curfile_update_check(cbt)); - - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) - WT_ERR(WT_NOTFOUND); - - ret = __cursor_row_modify(session, cbt, true); - break; } err: if (ret == WT_RESTART) { @@ -724,15 +961,27 @@ err: if (ret == WT_RESTART) { WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } + /* - * If the cursor is configured to overwrite and the record is not - * found, that is exactly what we want. + * If the cursor is configured to overwrite and the record is not found, + * that is exactly what we want, return success. */ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND) ret = 0; - if (ret != 0) +done: /* + * If the cursor was positioned, it stays positioned, point the cursor + * at an internal copy of the key. Otherwise, there's no position or + * key/value. + */ + if (ret == 0) + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (ret == 0 && positioned) + WT_TRET(__wt_key_return(session, cbt)); + else WT_TRET(__cursor_reset(cbt)); + if (ret != 0) + __cursor_state_restore(cursor, &state); return (ret); } @@ -745,6 +994,7 @@ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; + WT_CURFILE_STATE state; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; @@ -757,24 +1007,71 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_STAT_DATA_INCR(session, cursor_update); WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); + __cursor_state_save(cursor, &state); + if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); + /* It's no longer possible to bulk-load into the tree. */ + __cursor_disable_bulk(session, btree); + /* - * The tree is no longer empty: eviction should pay attention to it, - * and it's no longer possible to bulk-load into it. 
+ * If update positioned to an on-page key, the update doesn't require + * another search. We don't care about the "overwrite" configuration + * because regardless of the overwrite setting, any existing record is + * updated, and the record must exist with a positioned cursor. The + * cursor won't be positioned on a page with an external key set, but + * be sure. */ - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - __wt_btree_evictable(session, true); + if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + WT_ERR(__wt_txn_autocommit_check(session)); + /* + * The cursor position may not be exact (the cursor's comparison + * value not equal to zero). Correct to an exact match so we can + * update whatever we're pointing at. + */ + cbt->compare = 0; + ret = btree->type == BTREE_ROW ? + __cursor_row_modify(session, cbt, false) : + __cursor_col_modify(session, cbt, false); + if (ret == 0) + goto done; + + /* + * The pinned page goes away if we fail for any reason, make + * sure there's a local copy of any key. (Restart could still + * use the pinned page, but that's an unlikely path.) Re-save + * the cursor state: we may retry but eventually fail. + */ + WT_TRET(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); + goto err; } -retry: WT_RET(__cursor_func_init(cbt, true)); + /* + * The pinned page goes away if we do a search, make sure there's a + * local copy of any key. Re-save the cursor state: we may retry but + * eventually fail. + */ + WT_ERR(__cursor_copy_int_key(cursor)); + __cursor_state_save(cursor, &state); - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: +retry: WT_ERR(__cursor_func_init(cbt, true)); + + if (btree->type == BTREE_ROW) { + WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + /* + * If not overwriting, check for conflicts and fail if the key + * does not exist. 
+ */ + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + WT_ERR(__curfile_update_check(cbt)); + if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + WT_ERR(WT_NOTFOUND); + } + ret = __cursor_row_modify(session, cbt, false); + } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); /* @@ -787,25 +1084,12 @@ retry: WT_RET(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && + if ((cbt->compare != 0 || + !__wt_cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } ret = __cursor_col_modify(session, cbt, false); - break; - case BTREE_ROW: - WT_ERR(__cursor_row_search(session, cbt, NULL, true)); - /* - * If not overwriting, check for conflicts and fail if the key - * does not exist. - */ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { - WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) - WT_ERR(WT_NOTFOUND); - } - ret = __cursor_row_modify(session, cbt, false); - break; } err: if (ret == WT_RESTART) { @@ -822,116 +1106,14 @@ err: if (ret == WT_RESTART) { * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ - if (ret == 0) +done: if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); - if (ret != 0) + if (ret != 0) { WT_TRET(__cursor_reset(cbt)); - return (ret); -} - -/* - * __wt_btcur_next_random -- - * Move to a random record in the tree. There are two algorithms, one - * where we select a record at random from the whole tree on each - * retrieval and one where we first select a record at random from the - * whole tree, and then subsequently sample forward from that location. - * The sampling approach allows us to select reasonably uniform random - * points from unbalanced trees. 
- */ -int -__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_SESSION_IMPL *session; - WT_UPDATE *upd; - wt_off_t size; - uint64_t skip; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; - - /* - * Only supports row-store: applications can trivially select a random - * value from a column-store, if there were any reason to do so. - */ - if (btree->type != BTREE_ROW) - WT_RET_MSG(session, ENOTSUP, - "WT_CURSOR.next_random only supported by row-store tables"); - - WT_STAT_CONN_INCR(session, cursor_next); - WT_STAT_DATA_INCR(session, cursor_next); - - /* - * If retrieving random values without sampling, or we don't have a - * page reference, pick a roughly random leaf page in the tree. - */ - if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { - /* - * Skip past the sample size of the leaf pages in the tree - * between each random key return to compensate for unbalanced - * trees. - * - * Use the underlying file size divided by its block allocation - * size as our guess of leaf pages in the file (this can be - * entirely wrong, as it depends on how many pages are in this - * particular checkpoint, how large the leaf and internal pages - * really are, and other factors). Then, divide that value by - * the configured sample size and increment the final result to - * make sure tiny files don't leave us with a skip value of 0. - * - * !!! - * Ideally, the number would be prime to avoid restart issues. - */ - if (cbt->next_random_sample_size != 0) { - WT_ERR(btree->bm->size(btree->bm, session, &size)); - cbt->next_random_leaf_skip = (uint64_t) - ((size / btree->allocsize) / - cbt->next_random_sample_size) + 1; - } - - /* - * Choose a leaf page from the tree. - */ - WT_ERR(__cursor_func_init(cbt, true)); - WT_WITH_PAGE_INDEX( - session, ret = __wt_row_random_descent(session, cbt)); - WT_ERR(ret); - } else { - /* - * Read through the tree, skipping leaf pages. 
Be cautious about - * the skip count: if the last leaf page skipped was also the - * last leaf page in the tree, it may be set to zero on return - * with the end-of-walk condition. - * - * Pages read for data sampling aren't "useful"; don't update - * the read generation of pages already in memory, and if a page - * is read, set its generation to a low value so it is evicted - * quickly. - */ - for (skip = - cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) - WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, - WT_READ_NO_GEN | - WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + __cursor_state_restore(cursor, &state); } - /* - * Select a random entry from the leaf page. If it's not valid, move to - * the next entry, if that doesn't work, move to the previous entry. - */ - WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__cursor_valid(cbt, &upd)) - WT_ERR(__wt_kv_return(session, cbt, upd)); - else { - if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) - ret = __wt_btcur_prev(cbt, false); - WT_ERR(ret); - } - return (0); - -err: WT_TRET(__cursor_reset(cbt)); return (ret); } @@ -1060,9 +1242,12 @@ __cursor_truncate(WT_SESSION_IMPL *session, WT_DECL_RET; /* - * First, call the standard cursor remove method to do a full search and - * re-position the cursor because we don't have a saved copy of the - * page's write generation information, which we need to remove records. + * First, call the cursor search method to re-position the cursor: we + * may not have a cursor position (if the higher-level truncate code + * switched the cursors to have an "external" cursor key, and because + * we don't save a copy of the page's write generation information, + * which we need to remove records. 
+ * * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some * other thread of control; in that case, repeat the full search to @@ -1075,20 +1260,19 @@ __cursor_truncate(WT_SESSION_IMPL *session, * instantiated the end cursor, so we know that page is pinned in memory * and we can proceed without concern. */ -retry: WT_RET(__wt_btcur_remove(start)); +retry: WT_RET(__wt_btcur_search(start)); + WT_ASSERT(session, + F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + + for (;;) { + if ((ret = rmfunc(session, start, 1)) != 0) + break; - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. - */ - for (ret = 0;;) { if (stop != NULL && __cursor_equals(start, stop)) break; if ((ret = __wt_btcur_next(start, true)) != 0) break; - start->compare = 0; /* Exact match */ - if ((ret = rmfunc(session, start, 1)) != 0) - break; + start->compare = 0; /* Exact match */ } if (ret == WT_RESTART) { @@ -1121,29 +1305,32 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, * record 37, records 1-36 magically appear. Those records can't be * deleted, which means we have to ignore already "deleted" records. * - * First, call the standard cursor remove method to do a full search and - * re-position the cursor because we don't have a saved copy of the - * page's write generation information, which we need to remove records. + * First, call the cursor search method to re-position the cursor: we + * may not have a cursor position (if the higher-level truncate code + * switched the cursors to have an "external" cursor key, and because + * we don't save a copy of the page's write generation information, + * which we need to remove records. 
+ * * Once that's done, we can delete records without a full search, unless * we encounter a restart error because the page was modified by some * other thread of control; in that case, repeat the full search to * refresh the page's modification information. */ -retry: WT_RET(__wt_btcur_remove(start)); - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. - */ - for (ret = 0;;) { +retry: WT_RET(__wt_btcur_search(start)); + WT_ASSERT(session, + F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + + for (;;) { + value = (const uint8_t *)start->iface.value.data; + if (*value != 0 && + (ret = rmfunc(session, start, 1)) != 0) + break; + if (stop != NULL && __cursor_equals(start, stop)) break; if ((ret = __wt_btcur_next(start, true)) != 0) break; start->compare = 0; /* Exact match */ - value = (const uint8_t *)start->iface.value.data; - if (*value != 0 && - (ret = rmfunc(session, start, 1)) != 0) - break; } if (ret == WT_RESTART) { @@ -1263,7 +1450,7 @@ __wt_btcur_close(WT_CURSOR_BTREE *cbt, bool lowlevel) * Skip the usual cursor tear-down in that case. 
*/ if (!lowlevel) - ret = __curfile_leave(cbt); + ret = __cursor_reset(cbt); __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index d507cc0e396..d3f02e29b90 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -34,7 +34,7 @@ static const /* Output separator */ static int __debug_cell(WT_DBG *, const WT_PAGE_HEADER *, WT_CELL_UNPACK *); static int __debug_cell_data( - WT_DBG *, WT_PAGE *, int type, const char *, WT_CELL_UNPACK *); + WT_DBG *, WT_PAGE *, int, const char *, WT_CELL_UNPACK *); static int __debug_col_skip(WT_DBG *, WT_INSERT_HEAD *, const char *, bool); static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); @@ -64,7 +64,7 @@ __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v) const char *cfg[2] = { NULL, NULL }; char buf[256]; - snprintf(buf, sizeof(buf), "verbose=[%s]", v); + WT_RET(__wt_snprintf(buf, sizeof(buf), "verbose=[%s]", v)); cfg[0] = buf; return (__wt_verbose_config(session, cfg)); } @@ -87,6 +87,7 @@ __debug_hex_byte(WT_DBG *ds, uint8_t v) static int __dmsg_event(WT_DBG *ds, const char *fmt, ...) { + WT_DECL_RET; WT_ITEM *msg; WT_SESSION_IMPL *session; size_t len, space; @@ -107,8 +108,9 @@ __dmsg_event(WT_DBG *ds, const char *fmt, ...) p = (char *)msg->mem + msg->size; space = msg->memsize - msg->size; va_start(ap, fmt); - len = (size_t)vsnprintf(p, space, fmt, ap); + ret = __wt_vsnprintf_len_set(p, space, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. 
*/ if (len < space) { @@ -447,13 +449,14 @@ __debug_tree_shape_info(WT_PAGE *page) v = page->memory_footprint; if (v >= WT_GIGABYTE) - snprintf(buf, sizeof(buf), + (void)__wt_snprintf(buf, sizeof(buf), "(%p %" PRIu64 "G)", (void *)page, v / WT_GIGABYTE); else if (v >= WT_MEGABYTE) - snprintf(buf, sizeof(buf), + (void)__wt_snprintf(buf, sizeof(buf), "(%p %" PRIu64 "M)", (void *)page, v / WT_MEGABYTE); else - snprintf(buf, sizeof(buf), "(%p %" PRIu64 ")", (void *)page, v); + (void)__wt_snprintf(buf, sizeof(buf), + "(%p %" PRIu64 ")", (void *)page, v); return (buf); } @@ -652,7 +655,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) page = ref->page; mod = page->modify; - WT_RET(ds->f(ds, "%p", (void *)page)); + WT_RET(ds->f(ds, "%p", (void *)ref)); switch (page->type) { case WT_PAGE_COL_INT: @@ -662,25 +665,28 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) break; case WT_PAGE_COL_FIX: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_fix_entries; + entries = page->entries; break; case WT_PAGE_COL_VAR: WT_RET(ds->f(ds, " recno %" PRIu64, ref->ref_recno)); - entries = page->pg_var_entries; + entries = page->entries; break; case WT_PAGE_ROW_INT: WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_ROW_LEAF: - entries = page->pg_row_entries; + entries = page->entries; break; WT_ILLEGAL_VALUE(session); } WT_RET(ds->f(ds, ": %s\n", __wt_page_type_string(page->type))); - WT_RET(ds->f(ds, - "\t" "disk %p, entries %" PRIu32, (void *)page->dsk, entries)); + WT_RET(ds->f(ds, "\t" "disk %p", (void *)page->dsk)); + if (page->dsk != NULL) + WT_RET(ds->f( + ds, ", dsk_mem_size %" PRIu32, page->dsk->mem_size)); + WT_RET(ds->f(ds, ", entries %" PRIu32, entries)); WT_RET(ds->f(ds, ", %s", __wt_page_is_modified(page) ? 
"dirty" : "clean")); WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked( @@ -696,8 +702,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref) WT_RET(ds->f(ds, ", evict-lru")); if (F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS)) WT_RET(ds->f(ds, ", overflow-keys")); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) - WT_RET(ds->f(ds, ", split-block")); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) WT_RET(ds->f(ds, ", split-insert")); if (F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE)) @@ -837,7 +841,8 @@ __debug_page_col_var(WT_DBG *ds, WT_REF *ref) __wt_cell_unpack(cell, unpack); rle = __wt_cell_rle(unpack); } - snprintf(tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle); + WT_RET(__wt_snprintf( + tag, sizeof(tag), "%" PRIu64 " %" PRIu64, recno, rle)); WT_RET( __debug_cell_data(ds, page, WT_PAGE_COL_VAR, tag, unpack)); diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 00e41475de9..b55ad291c5e 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -318,13 +318,12 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ - if (page_del != NULL) WT_RET(__wt_calloc_def( - session, page->pg_row_entries + 1, &page_del->update_list)); + session, page->entries + 1, &page_del->update_list)); /* Allocate the per-page update array. */ - WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); + WT_ERR(__wt_calloc_def(session, page->entries, &upd_array)); page->modify->mod_row_update = upd_array; /* @@ -332,7 +331,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * structures, fill in the per-page update array with references to * deleted items. 
*/ - for (i = 0, size = 0; i < page->pg_row_entries; ++i) { + for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); WT_UPDATE_DELETED_SET(upd); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index c2733d6567b..bab7b8145d6 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -16,13 +16,14 @@ static void __free_skip_array( WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t, bool); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *, bool); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t, bool); +static void __page_out_int(WT_SESSION_IMPL *, WT_PAGE **, bool); /* - * __wt_ref_out -- + * __wt_ref_out_int -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) { /* * A version of the page-out function that allows us to make additional @@ -56,15 +57,25 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) } #endif - __wt_page_out(session, &ref->page); + __page_out_int(session, &ref->page, rewrite); } /* - * __wt_page_out -- + * __wt_ref_out -- * Discard an in-memory page, freeing all memory associated with it. */ void -__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) +__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) +{ + __wt_ref_out_int(session, ref, false); +} + +/* + * __page_out_int -- + * Discard an in-memory page, freeing all memory associated with it. + */ +static void +__page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite) { WT_PAGE *page; WT_PAGE_HEADER *dsk; @@ -103,7 +114,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) } /* Update the cache's information. 
*/ - __wt_cache_page_evict(session, page); + __wt_cache_page_evict(session, page, rewrite); dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) @@ -148,6 +159,16 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) } /* + * __wt_page_out -- + * Discard an in-memory page, freeing all memory associated with it. + */ +void +__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) +{ + __page_out_int(session, pagep, false); +} + +/* * __free_page_modify -- * Discard the page's associated modification structures. */ @@ -206,8 +227,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) if (mod->mod_col_update != NULL) __free_skip_array(session, mod->mod_col_update, page->type == - WT_PAGE_COL_FIX ? 1 : page->pg_var_entries, - update_ignore); + WT_PAGE_COL_FIX ? 1 : page->entries, update_ignore); break; case WT_PAGE_ROW_LEAF: /* @@ -219,12 +239,12 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) */ if (mod->mod_row_insert != NULL) __free_skip_array(session, mod->mod_row_insert, - page->pg_row_entries + 1, update_ignore); + page->entries + 1, update_ignore); /* Free the update array. */ if (mod->mod_row_update != NULL) __free_update(session, mod->mod_row_update, - page->pg_row_entries, update_ignore); + page->entries, update_ignore); break; } @@ -332,7 +352,7 @@ static void __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { /* Free the RLE lookup array. */ - __wt_free(session, page->pg_var_repeats); + __wt_free(session, page->u.col_var.repeats); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 47c7972dd57..d76720b19ae 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -15,6 +15,44 @@ static int __btree_preload(WT_SESSION_IMPL *); static int __btree_tree_open_empty(WT_SESSION_IMPL *, bool); /* + * __btree_clear -- + * Clear a Btree, either on handle discard or re-open. 
+ */ +static int +__btree_clear(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; + + btree = S2BT(session); + + /* + * If the tree hasn't gone through an open/close cycle, there's no + * cleanup to be done. + */ + if (!F_ISSET(btree, WT_BTREE_CLOSED)) + return (0); + + /* Close the Huffman tree. */ + __wt_btree_huffman_close(session); + + /* Terminate any associated collator. */ + if (btree->collator_owned && btree->collator->terminate != NULL) + WT_TRET(btree->collator->terminate( + btree->collator, &session->iface)); + + /* Destroy locks. */ + __wt_rwlock_destroy(session, &btree->ovfl_lock); + __wt_spin_destroy(session, &btree->flush_lock); + + /* Free allocated memory. */ + __wt_free(session, btree->key_format); + __wt_free(session, btree->value_format); + + return (ret); +} + +/* * __wt_btree_open -- * Open a Btree. */ @@ -28,12 +66,27 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_DATA_HANDLE *dhandle; WT_DECL_RET; size_t root_addr_size; + uint32_t mask; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; const char *filename; bool creation, forced_salvage, readonly; - dhandle = session->dhandle; btree = S2BT(session); + dhandle = session->dhandle; + + /* + * This may be a re-open of an underlying object and we have to clean + * up. We can't clear the operation flags, however, they're set by the + * connection handle software that called us. + */ + WT_RET(__btree_clear(session)); + + mask = F_MASK(btree, WT_BTREE_SPECIAL_FLAGS); + memset(btree, 0, sizeof(*btree)); + btree->flags = mask; + + /* Set the data handle first, our called functions reasonably use it. */ + btree->dhandle = dhandle; /* Checkpoint files are readonly. */ readonly = dhandle->checkpoint != NULL || @@ -126,6 +179,20 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) } } + /* + * Eviction ignores trees until the handle's open flag is set, configure + * eviction before that happens. + * + * Files that can still be bulk-loaded cannot be evicted. 
+ * Permanently cache-resident files can never be evicted. + * Special operations don't enable eviction. (The underlying commands + * may turn on eviction, but it's their decision.) + */ + if (btree->original || + F_ISSET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_REBALANCE | + WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) + WT_ERR(__wt_evict_file_exclusive_on(session)); + if (0) { err: WT_TRET(__wt_btree_close(session)); } @@ -147,7 +214,24 @@ __wt_btree_close(WT_SESSION_IMPL *session) btree = S2BT(session); + /* + * The close process isn't the same as discarding the handle: we might + * re-open the handle, which isn't a big deal, but the backing blocks + * for the handle may not yet have been discarded from the cache, and + * eviction uses WT_BTREE structure elements. Free backing resources + * but leave the rest alone, and we'll discard the structure when we + * discard the data handle. + * + * Handles can be closed multiple times, ignore all but the first. + */ + if (F_ISSET(btree, WT_BTREE_CLOSED)) + return (0); + F_SET(btree, WT_BTREE_CLOSED); + + /* Discard any underlying block manager resources. */ if ((bm = btree->bm) != NULL) { + btree->bm = NULL; + /* Unload the checkpoint, unless it's a special command. */ if (!F_ISSET(btree, WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) @@ -155,33 +239,26 @@ __wt_btree_close(WT_SESSION_IMPL *session) /* Close the underlying block manager reference. */ WT_TRET(bm->close(bm, session)); - - btree->bm = NULL; } - /* Close the Huffman tree. */ - __wt_btree_huffman_close(session); - - /* Destroy locks. */ - __wt_rwlock_destroy(session, &btree->ovfl_lock); - __wt_spin_destroy(session, &btree->flush_lock); - - /* Free allocated memory. 
*/ - __wt_free(session, btree->key_format); - __wt_free(session, btree->value_format); + return (ret); +} - if (btree->collator_owned) { - if (btree->collator->terminate != NULL) - WT_TRET(btree->collator->terminate( - btree->collator, &session->iface)); - btree->collator_owned = 0; - } - btree->collator = NULL; - btree->kencryptor = NULL; +/* + * __wt_btree_discard -- + * Discard a Btree. + */ +int +__wt_btree_discard(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_DECL_RET; - btree->bulk_load_ok = false; + ret = __btree_clear(session); - F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); + btree = S2BT(session); + __wt_overwrite_and_free(session, btree); + session->dhandle->handle = NULL; return (ret); } @@ -267,9 +344,9 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); if (cval.val) - F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + F_SET(btree, WT_BTREE_IN_MEMORY); else - F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + F_CLR(btree, WT_BTREE_IN_MEMORY); WT_RET(__wt_config_gets(session, cfg, "ignore_in_memory_cache_size", &cval)); @@ -282,6 +359,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } else F_CLR(btree, WT_BTREE_IGNORE_CACHE); + /* + * The metadata isn't blocked by in-memory cache limits because metadata + * "unroll" is performed by updates that are potentially blocked by the + * cache-full checks. + */ + if (WT_IS_METADATA(btree->dhandle)) + F_SET(btree, WT_BTREE_IGNORE_CACHE); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); @@ -359,8 +444,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) } /* Initialize locks. 
*/ - WT_RET(__wt_rwlock_alloc( - session, &btree->ovfl_lock, "btree overflow lock")); + __wt_rwlock_init(session, &btree->ovfl_lock); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ @@ -483,13 +567,10 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) /* * Newly created objects can be used for cursor inserts or for bulk * loads; set a flag that's cleared when a row is inserted into the - * tree. Objects being bulk-loaded cannot be evicted, we set it - * globally, there's no point in searching empty trees for eviction. + * tree. */ - if (creation) { - btree->bulk_load_ok = true; - __wt_btree_evictable(session, false); - } + if (creation) + btree->original = 1; /* * A note about empty trees: the initial tree is a single root page. @@ -582,27 +663,6 @@ __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) } /* - * __wt_btree_evictable -- - * Setup or release a cache-resident tree. - */ -void -__wt_btree_evictable(WT_SESSION_IMPL *session, bool on) -{ - WT_BTREE *btree; - - btree = S2BT(session); - - /* Permanently cache-resident files can never be evicted. */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) - return; - - if (on) - F_CLR(btree, WT_BTREE_NO_EVICTION); - else - F_SET(btree, WT_BTREE_NO_EVICTION); -} - -/* * __btree_preload -- * Pre-load internal pages. */ @@ -728,9 +788,16 @@ __btree_page_sizes(WT_SESSION_IMPL *session) * Get the split percentage (reconciliation splits pages into smaller * than the maximum page size chunks so we don't split every time a * new entry is added). Determine how large newly split pages will be. + * Set to the minimum, if the read value is less than that. 
*/ WT_RET(__wt_config_gets(session, cfg, "split_pct", &cval)); - btree->split_pct = (int)cval.val; + if (cval.val < WT_BTREE_MIN_SPLIT_PCT) { + btree->split_pct = WT_BTREE_MIN_SPLIT_PCT; + WT_RET(__wt_msg(session, + "Re-setting split_pct for %s to the minimum allowed of " + "%d%%.", session->dhandle->name, WT_BTREE_MIN_SPLIT_PCT)); + } else + btree->split_pct = (int)cval.val; intl_split_size = __wt_split_page_size(btree, btree->maxintlpage); leaf_split_size = __wt_split_page_size(btree, btree->maxleafpage); diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index a8645f79dbe..b5e4d52394a 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -183,7 +183,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t dst_len, len, result_len, size, src_len; int compression_failed; /* Extension API, so not a bool. */ uint8_t *dst, *src; - bool data_checksum, encrypted; + bool data_checksum, encrypted, timer; btree = S2BT(session); bm = btree->bm; @@ -216,7 +216,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, &result_len)); WT_ASSERT(session, dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP); - ctmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; + ctmp->size = result_len + WT_BLOCK_COMPRESS_SKIP; ip = ctmp; } else { WT_ASSERT(session, dsk->mem_size == buf->size); @@ -357,7 +357,8 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, data_checksum = !compressed; break; } - if (!F_ISSET(session, WT_SESSION_INTERNAL)) + timer = !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &start); /* Call the block manager to write the block. 
*/ @@ -367,7 +368,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, bm, session, ip, addr, addr_sizep, data_checksum, checkpoint_io)); /* Update some statistics now that the write is done */ - if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + if (timer) { __wt_epoch(session, &stop); WT_STAT_CONN_INCR(session, cache_write_app_count); WT_STAT_CONN_INCRV(session, cache_write_app_time, diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index 29ea561db3a..ae0da62af57 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -67,11 +67,11 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, * Acquire the overflow lock, and retest the on-page cell's value inside * the lock. */ - __wt_readlock(session, S2BT(session)->ovfl_lock); + __wt_readlock(session, &S2BT(session)->ovfl_lock); ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ? __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) : __ovfl_read(session, unpack->data, unpack->size, store); - __wt_readunlock(session, S2BT(session)->ovfl_lock); + __wt_readunlock(session, &S2BT(session)->ovfl_lock); return (ret); } @@ -249,7 +249,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) * Acquire the overflow lock to avoid racing with a thread reading the * backing overflow blocks. */ - __wt_writelock(session, btree->ovfl_lock); + __wt_writelock(session, &btree->ovfl_lock); switch (unpack->raw) { case WT_CELL_KEY_OVFL: @@ -263,7 +263,7 @@ __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_ILLEGAL_VALUE(session); } - __wt_writeunlock(session, btree->ovfl_lock); + __wt_writeunlock(session, &btree->ovfl_lock); /* Free the backing disk blocks. 
*/ return (bm->free(bm, session, unpack->data, unpack->size)); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 7bac7079fe8..f20f6398e37 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -67,7 +67,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, switch (type) { case WT_PAGE_COL_FIX: - page->pg_fix_entries = alloc_entries; + page->entries = alloc_entries; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: @@ -102,12 +102,12 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { } break; case WT_PAGE_COL_VAR: - page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_var_entries = alloc_entries; + page->pg_var = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; case WT_PAGE_ROW_LEAF: - page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); - page->pg_row_entries = alloc_entries; + page->pg_row = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); + page->entries = alloc_entries; break; WT_ILLEGAL_VALUE(session); } @@ -333,9 +333,10 @@ __inmem_col_var( WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; const WT_PAGE_HEADER *dsk; + size_t size; uint64_t rle; - size_t bytes_allocated; uint32_t i, indx, n, repeat_off; + void *p; btree = S2BT(session); dsk = page->dsk; @@ -343,7 +344,6 @@ __inmem_col_var( repeats = NULL; repeat_off = 0; unpack = &_unpack; - bytes_allocated = 0; /* * Walk the page, building references: the page contains unsorted value @@ -351,7 +351,7 @@ __inmem_col_var( * (WT_CELL_VALUE_OVFL) or deleted items (WT_CELL_DEL). 
*/ indx = 0; - cip = page->pg_var_d; + cip = page->pg_var; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); WT_COL_PTR_SET(cip, WT_PAGE_DISK_OFFSET(page, cell)); @@ -367,12 +367,14 @@ __inmem_col_var( if (rle > 1) { if (repeats == NULL) { __inmem_col_var_repeats(session, page, &n); - WT_RET(__wt_realloc_def(session, - &bytes_allocated, n + 1, &repeats)); + size = sizeof(WT_COL_VAR_REPEAT) + + (n + 1) * sizeof(WT_COL_RLE); + WT_RET(__wt_calloc(session, 1, size, &p)); + *sizep += size; - page->pg_var_repeats = repeats; + page->u.col_var.repeats = p; page->pg_var_nrepeats = n; - *sizep += bytes_allocated; + repeats = page->pg_var_repeats; } repeats[repeat_off].indx = indx; repeats[repeat_off].recno = recno; @@ -569,7 +571,7 @@ __inmem_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) unpack = &_unpack; /* Walk the page, building indices. */ - rip = page->pg_row_d; + rip = page->pg_row; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { __wt_cell_unpack(cell, unpack); switch (unpack->type) { diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c new file mode 100644 index 00000000000..c5948ec4ab5 --- /dev/null +++ b/src/btree/bt_random.c @@ -0,0 +1,432 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_row_random_leaf -- + * Return a random key from a row-store leaf page. + */ +int +__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_INSERT *ins, **start, **stop; + WT_INSERT_HEAD *ins_head; + WT_PAGE *page; + uint64_t samples; + uint32_t choice, entries, i; + int level; + + page = cbt->ref->page; + start = stop = NULL; /* [-Wconditional-uninitialized] */ + entries = 0; /* [-Wconditional-uninitialized] */ + + __cursor_pos_clear(cbt); + + /* If the page has disk-based entries, select from them. 
*/ + if (page->entries != 0) { + cbt->compare = 0; + cbt->slot = __wt_random(&session->rnd) % page->entries; + + /* + * The real row-store search function builds the key, so we + * have to as well. + */ + return (__wt_row_leaf_key(session, + page, page->pg_row + cbt->slot, cbt->tmp, false)); + } + + /* + * If the tree is new (and not empty), it might have a large insert + * list. + * + * Walk down the list until we find a level with at least 50 entries, + * that's where we'll start rolling random numbers. The value 50 is + * used to ignore levels with only a few entries, that is, levels which + * are potentially badly skewed. + */ + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + start = &ins_head->head[level]; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + + if (entries > 50) + break; + } + + /* + * If it's a tiny list and we went all the way to level 0, correct the + * level; entries is correctly set. + */ + if (level < 0) + level = 0; + + /* + * Step down the skip list levels, selecting a random chunk of the name + * space at each level. + */ + for (samples = entries; level > 0; samples += entries) { + /* + * There are (entries) or (entries + 1) chunks of the name space + * considered at each level. They are: between start and the 1st + * element, between the 1st and 2nd elements, and so on to the + * last chunk which is the name space after the stop element on + * the current level. This last chunk of name space may or may + * not be there: as we descend the levels of the skip list, this + * chunk may appear, depending if the next level down has + * entries logically after the stop point in the current level. 
+ * We can't ignore those entries: because of the algorithm used + * to determine the depth of a skiplist, there may be a large + * number of entries "revealed" by descending a level. + * + * If the next level down has more items after the current stop + * point, there are (entries + 1) chunks to consider, else there + * are (entries) chunks. + */ + if (*(stop - 1) == NULL) + choice = __wt_random(&session->rnd) % entries; + else + choice = __wt_random(&session->rnd) % (entries + 1); + + if (choice == entries) { + /* + * We selected the name space after the stop element on + * this level. Set the start point to the current stop + * point, descend a level and move the stop element to + * the end of the list, that is, the end of the newly + * discovered name space, counting entries as we go. + */ + start = stop; + --start; + --level; + for (entries = 0, stop = start; + *stop != NULL; stop = &(*stop)->next[level]) + ++entries; + } else { + /* + * We selected another name space on the level. Move the + * start pointer the selected number of entries forward + * to the start of the selected chunk (if the selected + * number is 0, start won't move). Set the stop pointer + * to the next element in the list and drop both start + * and stop down a level. + */ + for (i = 0; i < choice; ++i) + start = &(*start)->next[level]; + stop = &(*start)->next[level]; + + --start; + --stop; + --level; + + /* Count the entries in the selected name space. */ + for (entries = 0, + ins = *start; ins != *stop; ins = ins->next[level]) + ++entries; + } + } + + /* + * When we reach the bottom level, entries will already be set. Select + * a random entry from the name space and return it. + * + * It should be impossible for the entries count to be 0 at this point, + * but check for it out of paranoia and to quiet static testing tools. 
+ */ + if (entries > 0) + entries = __wt_random(&session->rnd) % entries; + for (ins = *start; entries > 0; --entries) + ins = ins->next[0]; + + cbt->ins = ins; + cbt->ins_head = ins_head; + cbt->compare = 0; + + /* + * Random lookups in newly created collections can be slow if a page + * consists of a large skiplist. Schedule the page for eviction if we + * encounter a large skiplist. This is worthwhile because applications + * that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. + */ + if (samples > 5000) + __wt_page_evict_soon(session, cbt->ref); + + return (0); +} + +/* + * __wt_random_descent -- + * Find a random page in a tree for either sampling or eviction. + */ +int +__wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + WT_PAGE_INDEX *pindex; + WT_REF *current, *descent; + uint32_t flags, i, entries, retry; + + *refp = NULL; + + btree = S2BT(session); + current = NULL; + retry = 100; + + /* Eviction should not be tapped to do eviction. */ + if (eviction) + flags = WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | + WT_READ_NO_WAIT | WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK; + else + flags = WT_READ_RESTART_OK; + + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, flags)); + } + + /* Search the internal pages of the tree. */ + current = &btree->root; + for (;;) { + page = current->page; + if (!WT_PAGE_IS_INTERNAL(page)) + break; + + WT_INTL_INDEX_GET(session, page, pindex); + entries = pindex->entries; + + /* Eviction just wants any random child. */ + if (eviction) { + descent = pindex->index[ + __wt_random(&session->rnd) % entries]; + goto descend; + } + + /* + * There may be empty pages in the tree, and they're useless to + * us.
If we don't find a non-empty page in "entries" random + * guesses, take the first non-empty page in the tree. If the + * search page contains nothing other than empty pages, restart + * from the root some number of times before giving up. + * + * Random sampling is looking for a key/value pair on a random + * leaf page, and so will accept any page that contains a valid + * key/value pair, so on-disk is fine, but deleted is not. + */ + descent = NULL; + for (i = 0; i < entries; ++i) { + descent = + pindex->index[__wt_random(&session->rnd) % entries]; + if (descent->state == WT_REF_MEM || + descent->state == WT_REF_DISK) + break; + } + if (i == entries) + for (i = 0; i < entries; ++i) { + descent = pindex->index[i]; + if (descent->state == WT_REF_MEM || + descent->state == WT_REF_DISK) + break; + } + if (i == entries || descent == NULL) { + if (--retry > 0) + goto restart; + + WT_RET(__wt_page_release(session, current, flags)); + return (WT_NOTFOUND); + } + + /* + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. + */ +descend: if ((ret = + __wt_page_swap(session, current, descent, flags)) == 0) { + current = descent; + continue; + } + if (eviction && (ret == WT_NOTFOUND || ret == WT_RESTART)) + break; + if (ret == WT_RESTART) + goto restart; + return (ret); + } + + /* + * There is no point starting with the root page: the walk will exit + * immediately. In that case we aren't holding a hazard pointer so + * there is nothing to release. + */ + if (!eviction || !__wt_ref_is_root(current)) + *refp = current; + return (0); +} + +/* + * __wt_btcur_next_random -- + * Move to a random record in the tree. 
There are two algorithms, one + * where we select a record at random from the whole tree on each + * retrieval and one where we first select a record at random from the + * whole tree, and then subsequently sample forward from that location. + * The sampling approach allows us to select reasonably uniform random + * points from unbalanced trees. + */ +int +__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) +{ + WT_BTREE *btree; + WT_CURSOR *cursor; + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_UPDATE *upd; + wt_off_t size; + uint64_t n, skip; + + btree = cbt->btree; + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cbt->iface.session; + + /* + * Only supports row-store: applications can trivially select a random + * value from a column-store, if there were any reason to do so. + */ + if (btree->type != BTREE_ROW) + WT_RET_MSG(session, ENOTSUP, + "WT_CURSOR.next_random only supported by row-store tables"); + + WT_STAT_CONN_INCR(session, cursor_next); + WT_STAT_DATA_INCR(session, cursor_next); + + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + +#ifdef HAVE_DIAGNOSTIC + /* + * Under some conditions we end up using the underlying cursor.next to + * walk through the object. Since there are multiple calls, we can hit + * the cursor-order checks, turn them off. + */ + __wt_cursor_key_order_reset(cbt); +#endif + /* + * If we don't have a current position in the tree, or if retrieving + * random values without sampling, pick a roughly random leaf page in + * the tree and return an entry from it. + */ + if (cbt->ref == NULL || cbt->next_random_sample_size == 0) { + WT_ERR(__cursor_func_init(cbt, true)); + WT_WITH_PAGE_INDEX(session, + ret = __wt_random_descent(session, &cbt->ref, false)); + if (ret == 0) + goto random_page_entry; + + /* + * Random descent may return not-found: the tree might be empty + * or have so many deleted items we didn't find any valid pages. 
+ * We can't return WT_NOTFOUND to the application unless a tree + * is really empty, fallback to skipping through tree pages. + */ + WT_ERR_NOTFOUND_OK(ret); + } + + /* + * Cursor through the tree, skipping past the sample size of the leaf + * pages in the tree between each random key return to compensate for + * unbalanced trees. + * + * If the random descent attempt failed, we don't have a configured + * sample size, use 100 for no particular reason. + */ + if (cbt->next_random_sample_size == 0) + cbt->next_random_sample_size = 100; + + /* + * If the random descent attempt failed, or it's our first skip attempt, + * we haven't yet set the pages to skip, do it now. + * + * Use the underlying file size divided by its block allocation size as + * our guess of leaf pages in the file (this can be entirely wrong, as + * it depends on how many pages are in this particular checkpoint, how + * large the leaf and internal pages really are, and other factors). + * Then, divide that value by the configured sample size and increment + * the final result to make sure tiny files don't leave us with a skip + * value of 0. + * + * !!! + * Ideally, the number would be prime to avoid restart issues. + */ + if (cbt->next_random_leaf_skip == 0) { + WT_ERR(btree->bm->size(btree->bm, session, &size)); + cbt->next_random_leaf_skip = (uint64_t) + ((size / btree->allocsize) / + cbt->next_random_sample_size) + 1; + } + + /* + * Be paranoid about loop termination: first, if the last leaf page + * skipped was also the last leaf page in the tree, skip may be set to + * zero on return along with the NULL WT_REF end-of-walk condition. + * Second, if a tree has no valid pages at all (the condition after + * initial creation), we might make no progress at all, or finally, if + * a tree has only deleted pages, we'll make progress, but never get a + * useful WT_REF. And, of course, the tree can switch from one of these + * states to another without warning. 
Decrement skip regardless of what + * is happening in the search, guarantee we eventually quit. + * + * Pages read for data sampling aren't "useful"; don't update the read + * generation of pages already in memory, and if a page is read, set + * its generation to a low value so it is evicted quickly. + */ + for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) { + n = skip; + WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip, + WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED)); + if (n == skip) { + if (skip == 0) + break; + --skip; + } + } + + /* + * We can't return WT_NOTFOUND to the application unless a tree is + * really empty, fallback to a random entry from the first page in the + * tree that has anything at all. + */ + if (cbt->ref == NULL) + WT_ERR(__wt_btcur_next(cbt, false)); + +random_page_entry: + /* + * Select a random entry from the leaf page. If it's not valid, move to + * the next entry, if that doesn't work, move to the previous entry. + */ + WT_ERR(__wt_row_random_leaf(session, cbt)); + if (__wt_cursor_valid(cbt, &upd)) + WT_ERR(__wt_kv_return(session, cbt, upd)); + else { + if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) + ret = __wt_btcur_prev(cbt, false); + WT_ERR(ret); + } + return (0); + +err: WT_TRET(__cursor_reset(cbt)); + return (ret); +} diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index 39f9e1159cb..64874547b9c 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -369,6 +369,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) size_t addr_size; uint32_t previous_state; const uint8_t *addr; + bool timer; btree = S2BT(session); page = NULL; @@ -408,10 +409,11 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) * There's an address, read or map the backing disk page and build an * in-memory version of the page. 
*/ - if (!F_ISSET(session, WT_SESSION_INTERNAL)) + timer = !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &start); WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); - if (!F_ISSET(session, WT_SESSION_INTERNAL)) { + if (timer) { __wt_epoch(session, &stop); WT_STAT_CONN_INCR(session, cache_read_app_count); WT_STAT_CONN_INCRV(session, cache_read_app_time, @@ -590,8 +592,9 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - (F_ISSET(btree, WT_BTREE_NO_EVICTION) && - !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) + btree->lsm_primary || + (btree->evict_disabled > 0 && + !F_ISSET(btree, WT_BTREE_ALLOW_SPLITS))) goto skip_evict; /* diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 29380459b94..68848c7c8f5 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -265,7 +265,7 @@ __rebalance_row_leaf_key(WT_SESSION_IMPL *session, */ WT_RET(__wt_bt_read(session, rs->tmp1, addr, addr_len)); WT_RET(__wt_page_inmem(session, NULL, rs->tmp1->data, 0, 0, &page)); - ret = __wt_row_leaf_key_copy(session, page, &page->pg_row_d[0], key); + ret = __wt_row_leaf_key_copy(session, page, &page->pg_row[0], key); __wt_page_out(session, &page); return (ret); } @@ -406,12 +406,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_BTREE *btree; WT_DECL_RET; WT_REBALANCE_STUFF *rs, _rstuff; - bool evict_reset; WT_UNUSED(cfg); btree = S2BT(session); - evict_reset = false; /* * If the tree has never been written to disk, we're done, rebalance @@ -433,14 +431,6 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) /* Set the internal page tree type. */ rs->type = btree->root.page->type; - /* - * Get exclusive access to the file. (Not required, the only page in the - * cache is the root page, and that cannot be evicted; however, this way - * eviction ignores the tree entirely.) 
- */ - WT_ERR(__wt_evict_file_exclusive_on(session)); - evict_reset = true; - /* Recursively walk the tree. */ switch (rs->type) { case WT_PAGE_ROW_INT: @@ -471,10 +461,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) btree->root.page = rs->root; rs->root = NULL; -err: if (evict_reset) - __wt_evict_file_exclusive_off(session); - - /* Discard any leftover root page we created. */ +err: /* Discard any leftover root page we created. */ if (rs->root != NULL) { __wt_page_modify_clear(session, rs->root); __wt_page_out(session, &rs->root); diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c index 8ef2db67e7b..f17fa1b85d1 100644 --- a/src/btree/bt_ret.c +++ b/src/btree/bt_ret.c @@ -9,65 +9,22 @@ #include "wt_internal.h" /* - * __wt_kv_return -- - * Return a page referenced key/value pair to the application. + * __key_return -- + * Change the cursor to reference an internal return key. */ -int -__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +static inline int +__key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { - WT_BTREE *btree; - WT_CELL *cell; - WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_ITEM *tmp; WT_PAGE *page; WT_ROW *rip; - uint8_t v; - - btree = S2BT(session); page = cbt->ref->page; cursor = &cbt->iface; - switch (page->type) { - case WT_PAGE_COL_FIX: - /* - * The interface cursor's record has usually been set, but that - * isn't universally true, specifically, cursor.search_near may - * call here without first setting the interface cursor. - */ - cursor->recno = cbt->recno; - - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; - return (0); - } - - /* Take the value from the original page. 
*/ - v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt); - return (__wt_buf_set(session, &cursor->value, &v, 1)); - case WT_PAGE_COL_VAR: - /* - * The interface cursor's record has usually been set, but that - * isn't universally true, specifically, cursor.search_near may - * call here without first setting the interface cursor. - */ - cursor->recno = cbt->recno; - - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; - return (0); - } - - /* Take the value from the original page cell. */ - cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]); - break; - case WT_PAGE_ROW_LEAF: - rip = &page->pg_row_d[cbt->slot]; + if (page->type == WT_PAGE_ROW_LEAF) { + rip = &page->pg_row[cbt->slot]; /* * If the cursor references a WT_INSERT item, take its key. @@ -79,7 +36,10 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) if (cbt->ins != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); - } else if (cbt->compare == 0) { + return (0); + } + + if (cbt->compare == 0) { /* * If not in an insert list and there's an exact match, * the row-store search function built the key we want @@ -97,16 +57,51 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) cursor->key.data = cbt->row_key->data; cursor->key.size = cbt->row_key->size; - } else - WT_RET(__wt_row_leaf_key( - session, page, rip, &cursor->key, false)); - - /* If the cursor references a WT_UPDATE item, return it. 
*/ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; return (0); } + return (__wt_row_leaf_key( + session, page, rip, &cursor->key, false)); + } + + /* + * WT_PAGE_COL_FIX, WT_PAGE_COL_VAR: + * The interface cursor's record has usually been set, but that + * isn't universally true, specifically, cursor.search_near may call + * here without first setting the interface cursor. + */ + cursor->recno = cbt->recno; + return (0); +} + +/* + * __value_return -- + * Change the cursor to reference an internal return value. + */ +static inline int +__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_BTREE *btree; + WT_CELL *cell; + WT_CELL_UNPACK unpack; + WT_CURSOR *cursor; + WT_PAGE *page; + WT_ROW *rip; + uint8_t v; + + btree = S2BT(session); + + page = cbt->ref->page; + cursor = &cbt->iface; + + /* If the cursor references a WT_UPDATE item, return it. */ + if (upd != NULL) { + cursor->value.data = WT_UPDATE_DATA(upd); + cursor->value.size = upd->size; + return (0); + } + + if (page->type == WT_PAGE_ROW_LEAF) { + rip = &page->pg_row[cbt->slot]; /* Simple values have their location encoded in the WT_ROW. */ if (__wt_row_leaf_value(page, rip, &cursor->value)) @@ -121,13 +116,65 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) cursor->value.size = 0; return (0); } - break; - WT_ILLEGAL_VALUE(session); + __wt_cell_unpack(cell, &unpack); + return (__wt_page_cell_data_ref( + session, page, &unpack, &cursor->value)); + } - /* The value is an on-page cell, unpack and expand it as necessary. */ - __wt_cell_unpack(cell, &unpack); - WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value)); + if (page->type == WT_PAGE_COL_VAR) { + /* Take the value from the original page cell. 
*/ + cell = WT_COL_PTR(page, &page->pg_var[cbt->slot]); + __wt_cell_unpack(cell, &unpack); + return (__wt_page_cell_data_ref( + session, page, &unpack, &cursor->value)); + } + + /* WT_PAGE_COL_FIX: Take the value from the original page. */ + v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt); + return (__wt_buf_set(session, &cursor->value, &v, 1)); +} + +/* + * __wt_key_return -- + * Change the cursor to reference an internal return key. + */ +int +__wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + WT_CURSOR *cursor; + + cursor = &cbt->iface; + + /* + * We may already have an internal key, in which case the cursor may + * not be set up to get another copy (for example, when we rely on a + * search-function result). + */ + F_CLR(cursor, WT_CURSTD_KEY_EXT); + if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) { + WT_RET(__key_return(session, cbt)); + F_SET(cursor, WT_CURSTD_KEY_INT); + } + return (0); +} + +/* + * __wt_kv_return -- + * Return a page referenced key/value pair to the application. 
+ */ +int +__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_CURSOR *cursor; + + cursor = &cbt->iface; + + WT_RET(__wt_key_return(session, cbt)); + + F_CLR(cursor, WT_CURSTD_VALUE_EXT); + WT_RET(__value_return(session, cbt, upd)); + F_SET(cursor, WT_CURSTD_VALUE_INT); return (0); } diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index fde4d4fb9de..165f932afb2 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -166,13 +166,11 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_DECL_RET; WT_STUFF *ss, stuff; uint32_t i, leaf_cnt; - bool evict_reset; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; - evict_reset = false; WT_CLEAR(stuff); ss = &stuff; @@ -184,13 +182,6 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); /* - * Salvage handles its own page eviction; get exclusive access to the - * file, have eviction ignore the tree entirely. - */ - WT_ERR(__wt_evict_file_exclusive_on(session)); - evict_reset = true; - - /* * Step 1: * Inform the underlying block manager that we're salvaging the file. */ @@ -350,9 +341,6 @@ err: WT_TRET(bm->salvage_end(bm, session)); if (ss->root_ref.page != NULL) __wt_ref_out(session, &ss->root_ref); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - /* Discard the leaf and overflow page memory. 
*/ WT_TRET(__slvg_cleanup(session, ss)); @@ -603,9 +591,9 @@ __slvg_trk_leaf(WT_SESSION_IMPL *session, */ WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, 0, &page)); WT_ERR(__wt_row_leaf_key_copy(session, - page, &page->pg_row_d[0], &trk->row_start)); - WT_ERR(__wt_row_leaf_key_copy(session, page, - &page->pg_row_d[page->pg_row_entries - 1], &trk->row_stop)); + page, &page->pg_row[0], &trk->row_start)); + WT_ERR(__wt_row_leaf_key_copy(session, + page, &page->pg_row[page->entries - 1], &trk->row_stop)); __wt_verbose(session, WT_VERB_SALVAGE, "%s start key %s", @@ -1235,7 +1223,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_PAGE *page; WT_SALVAGE_COOKIE *cookie, _cookie; uint64_t recno, skip, take; - uint32_t *entriesp, save_entries; + uint32_t save_entries; cookie = &_cookie; WT_CLEAR(*cookie); @@ -1244,11 +1232,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_RET(__wt_page_in(session, ref, 0)); page = ref->page; - entriesp = page->type == WT_PAGE_COL_VAR ? - &page->pg_var_entries : &page->pg_fix_entries; - - save_col_var = page->pg_var_d; - save_entries = *entriesp; + save_col_var = page->pg_var; + save_entries = page->entries; /* * Calculate the number of K/V entries we are going to skip, and @@ -1303,8 +1288,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_var_d = save_col_var; - *entriesp = save_entries; + page->pg_var = save_col_var; + page->entries = save_entries; ret = __wt_page_release(session, ref, 0); if (ret == 0) @@ -1973,14 +1958,14 @@ __slvg_row_build_leaf( /* We should have selected some entries, but not the entire page. */ WT_ASSERT(session, skip_start + skip_stop > 0 && - skip_start + skip_stop < page->pg_row_entries); + skip_start + skip_stop < page->entries); /* * Take a copy of this page's first key to define the start of * its range. 
The key may require processing, otherwise, it's * a copy from the page. */ - rip = page->pg_row_d + skip_start; + rip = page->pg_row + skip_start; WT_ERR(__wt_row_leaf_key(session, page, rip, key, false)); WT_ERR(__wt_row_ikey_incr( session, ref->home, 0, key->data, key->size, ref)); @@ -1988,14 +1973,14 @@ __slvg_row_build_leaf( /* Set the referenced flag on overflow pages we're using. */ if (trk->trk_ovfl_cnt != 0) WT_ERR(__slvg_row_ovfl(session, - trk, page, skip_start, page->pg_row_entries - skip_stop)); + trk, page, skip_start, page->entries - skip_stop)); /* * Change the page to reflect the correct record count: there is no * need to copy anything on the page itself, the entries value limits * the number of page items. */ - page->pg_row_entries -= skip_stop; + page->entries -= skip_stop; cookie->skip = skip_start; /* @@ -2014,7 +1999,7 @@ __slvg_row_build_leaf( WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR, NULL)); /* Reset the page. */ - page->pg_row_entries += skip_stop; + page->entries += skip_stop; /* * Discard our hazard pointer and evict the page, updating the @@ -2081,7 +2066,7 @@ __slvg_row_ovfl(WT_SESSION_IMPL *session, * We're merging a row-store page, and we took some number of records, * figure out which (if any) overflow records we used. 
*/ - for (rip = page->pg_row_d + start; start < stop; ++start, ++rip) { + for (rip = page->pg_row + start; start < stop; ++start, ++rip) { copy = WT_ROW_KEY_COPY(rip); (void)__wt_row_leaf_key_info( page, copy, NULL, &cell, NULL, NULL); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fe49f937719..49043c8bab4 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -10,8 +10,8 @@ #define WT_MEM_TRANSFER(from_decr, to_incr, len) do { \ size_t __len = (len); \ - from_decr += __len; \ - to_incr += __len; \ + (from_decr) += __len; \ + (to_incr) += __len; \ } while (0) /* @@ -54,6 +54,16 @@ __split_oldest_gen(WT_SESSION_IMPL *session) } /* + * __wt_split_obsolete -- + * Check if it is safe to free / evict based on split generation. + */ +bool +__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) +{ + return (split_gen < __split_oldest_gen(session)); +} + +/* * __split_stash_add -- * Add a new entry into the session's split stash list. */ @@ -109,7 +119,7 @@ __wt_split_stash_discard(WT_SESSION_IMPL *session) ++i, ++stash) { if (stash->p == NULL) continue; - else if (stash->split_gen >= oldest) + if (stash->split_gen >= oldest) break; /* * It's a bad thing if another thread is in this memory after @@ -177,7 +187,7 @@ __split_safe_free(WT_SESSION_IMPL *session, exclusive = true; if (exclusive) { - __wt_free(session, p); + __wt_overwrite_and_free_len(session, p, s); return (0); } @@ -187,7 +197,7 @@ __split_safe_free(WT_SESSION_IMPL *session, #ifdef HAVE_DIAGNOSTIC /* * __split_verify_intl_key_order -- - * Verify the key order on an internal page after a split, diagnostic only. + * Verify the key order on an internal page after a split. */ static void __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) @@ -239,6 +249,46 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) break; } } + +/* + * __split_verify_root -- + * Verify a root page involved in a split. 
+ */ +static int +__split_verify_root(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_REF *ref; + + /* The split is complete and live, verify all of the pages involved. */ + __split_verify_intl_key_order(session, page); + + WT_INTL_FOREACH_BEGIN(session, page, ref) { + /* + * An eviction thread might be attempting to evict the page + * (the WT_REF may be WT_REF_LOCKED), or it may be a disk based + * page (the WT_REF may be WT_REF_READING), or it may be in + * some other state. Acquire a hazard pointer for any + * in-memory pages so we know the state of the page. + * + * Ignore pages not in-memory (deleted, on-disk, being read), + * there's no in-memory structure to check. + */ + if ((ret = __wt_page_in(session, + ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) + continue; + WT_ERR(ret); + + __split_verify_intl_key_order(session, ref->page); + + WT_ERR(__wt_page_release(session, ref, WT_READ_NO_EVICT)); + } WT_INTL_FOREACH_END; + + return (0); + +err: /* Something really bad just happened. */ + WT_PANIC_RET(session, ret, "fatal error during page split"); +} #endif /* @@ -390,12 +440,12 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_ref_step1 -- + * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ static void -__split_ref_step1( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) +__split_ref_prepare(WT_SESSION_IMPL *session, + WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; @@ -418,30 +468,25 @@ __split_ref_step1( child = ref->page; /* - * Block eviction and splits in newly created pages. + * Block eviction in newly created pages. * * Once the split is live, newly created internal pages might be * evicted and their WT_REF structures freed. If that happened * before all threads exit the index of the page that previously * "owned" the WT_REF, a thread might see a freed WT_REF. 
To - * ensure that doesn't happen, the newly created page's modify - * structure has a field with a transaction ID that's checked - * before any internal page is evicted. Unfortunately, we don't - * know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but - * the act of updating the original page's index is what allows - * the eviction to happen. + * ensure that doesn't happen, the newly created page contains + * the current split generation and can't be evicted until + * all readers have left the old generation. * - * Split blocking was because historic versions of the split - * code didn't update the WT_REF.home field until after the - * split was live, so the WT_REF.home fields being updated could - * split again before the update, there's a race between splits - * as to which would update them first. The current code updates - * the WT_REF.home fields before going live (in this function), - * this shouldn't be an issue, but for now splits remain turned - * off. + * Historically, we also blocked splits in newly created pages + * because we didn't update the WT_REF.home field until after + * the split was live, so the WT_REF.home fields being updated + * could split again before the update, there's a race between + * splits as to which would update them first. The current code + * updates the WT_REF.home fields before going live (in this + * function), this isn't an issue. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + child->pg_intl_split_gen = split_gen; /* * We use a page flag to prevent the child from splitting from @@ -465,64 +510,6 @@ __split_ref_step1( } /* - * __split_ref_step2 -- - * Allow the newly created children to be evicted or split.
- */ -static int -__split_ref_step2( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) -{ - WT_DECL_RET; - WT_PAGE *child; - WT_REF *ref; - uint32_t i; - - /* - * The split has gone live, enable eviction and splits on the newly - * created internal pages. - */ - WT_WRITE_BARRIER(); - - for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { - ref = pindex->index[i]; - - /* - * We don't hold hazard pointers on created pages, they cannot - * be evicted because the page-modify transaction value set as - * they were created prevents eviction. (See above, we reset - * that value as part of fixing up the page.) But, an eviction - * thread might be attempting to evict the page (the WT_REF may - * be WT_REF_LOCKED), or it may be a disk based page (the WT_REF - * may be WT_REF_READING), or it may be in some other state. - * Acquire a hazard pointer for any in-memory pages so we know - * the state of the page. Ignore pages not in-memory (deleted, - * on-disk, being read), there's no in-memory structure to fix. - */ - if ((ret = __wt_page_in(session, - ref, WT_READ_CACHE | WT_READ_NO_EVICT)) == WT_NOTFOUND) - continue; - WT_ERR(ret); - - child = ref->page; - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); - -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, child)); -#endif - - WT_ERR(__wt_hazard_clear(session, ref)); - } - - return (0); - -err: /* Something really bad just happened. */ - WT_PANIC_RET(session, ret, "fatal error resolving a split"); -} - -/* * __split_root -- * Split the root page in-memory, deepening the tree. */ @@ -653,8 +640,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Prepare the WT_REFs for the move. 
*/ - __split_ref_step1(session, alloc_index, false); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, false); /* * Confirm the root page's index hasn't moved, then update it, which @@ -662,20 +653,27 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); + alloc_index = NULL; + + WT_LEAVE_PAGE_INDEX(session); + + /* + * Get a generation for this split, mark the root page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + root->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, root)); + ret = __split_verify_root(session, root)); + WT_ERR(ret); #endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, false)); - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; - /* We've installed the allocated page-index, ensure error handling. */ - alloc_index = NULL; - /* * We can't free the previous root's index, there may be threads using * it. Add to the session's discard list, to be freed once we know no @@ -686,7 +684,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * fails, we don't roll back that change, because threads may already * be using the new index. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; @@ -846,10 +843,13 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; -#ifdef HAVE_DIAGNOSTIC - WT_WITH_PAGE_INDEX(session, - __split_verify_intl_key_order(session, parent)); -#endif + /* + * Get a generation for this split, mark the page. This must be after + * the new index is swapped into place in order to know that no readers + * are looking at the old index. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + parent->pg_intl_split_gen = split_gen; /* * If discarding the page's original WT_REF field, reset it to split. @@ -869,16 +869,25 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, __wt_free(session, ref->page_del); } + /* + * Set the discarded WT_REF state to split, ensuring we don't + * race with any discard of the WT_REF deleted fields. + */ WT_PUBLISH(ref->state, WT_REF_SPLIT); + + /* + * Push out the change: not required for correctness, but stops + * threads spinning on incorrect page references. + */ + WT_FULL_BARRIER(); } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, parent)); +#endif - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; /* @@ -908,7 +917,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * * Acquire a new split generation. 
*/ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; WT_ASSERT(session, next_ref->state == WT_REF_SPLIT); @@ -1160,16 +1168,34 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* Prepare the WT_REFs for the move. */ - __split_ref_step1(session, alloc_index, true); + /* + * Prepare the WT_REFs for the move: this requires a stable split + * generation to block splits in newly created pages, so get one. + */ + WT_ENTER_PAGE_INDEX(session); + __split_ref_prepare(session, alloc_index, session->split_gen, true); /* Split into the parent. */ - WT_ERR(__split_parent(session, page_ref, alloc_index->index, - alloc_index->entries, parent_incr, false, false)); + if ((ret = __split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)) == 0) { + /* + * Confirm the page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. + */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + } - /* Confirm the page's index hasn't moved, then update it. */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); - WT_INTL_INDEX_SET(page, replace_index); + WT_LEAVE_PAGE_INDEX(session); + WT_ERR(ret); + + /* + * Get a generation for this split, mark the parent page. This must be + * after the new index is swapped into place in order to know that no + * readers are looking at the old index. 
+ */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + page->pg_intl_split_gen = split_gen; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, @@ -1178,19 +1204,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __split_verify_intl_key_order(session, page)); #endif - /* Finalize the WT_REFs we moved. */ - WT_ERR(__split_ref_step2(session, alloc_index, true)); - - /* The split is complete and correct, ignore benign errors. */ + /* The split is complete and verified, ignore benign errors. */ complete = WT_ERR_IGNORE; /* - * Push out the changes: not required for correctness, but no reason - * to wait. - */ - WT_FULL_BARRIER(); - - /* * We don't care about the page-index we allocated, all we needed was * the array of WT_REF structures, which has now been split into the * parent page. @@ -1207,7 +1224,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * back that change, because threads may already be using the new parent * page. */ - split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; @@ -1284,10 +1300,6 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, for (;;) { parent = ref->home; - /* Skip pages that aren't ready to split. */ - if (F_ISSET_ATOMIC(parent, WT_PAGE_SPLIT_BLOCK)) - return (EBUSY); - if (trylock) WT_RET(__wt_try_writelock(session, &parent->page_lock)); else @@ -1770,9 +1782,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) /* Find the last item on the page. */ if (type == WT_PAGE_ROW_LEAF) - ins_head = page->pg_row_entries == 0 ? + ins_head = page->entries == 0 ? 
WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->entries - 1); else ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); @@ -1822,7 +1834,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) key->size = WT_INSERT_KEY_SIZE(ins); } else WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); + session, page, &page->pg_row[0], key, true)); WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); parent_incr += sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); @@ -2086,8 +2098,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); if ((ret = __split_insert(session, ref)) != 0) { @@ -2178,8 +2189,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { @@ -2207,8 +2217,7 @@ __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *parent; bool hazard; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref->page); + __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref); WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); ret = __split_parent(session, ref, NULL, 0, 0, false, true); @@ -2229,8 +2238,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) page = ref->page; - __wt_verbose( - session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref->page); + 
__wt_verbose(session, WT_VERB_SPLIT, "%p: split-rewrite", (void *)ref); /* * This isn't a split: a reconciliation failed because we couldn't write @@ -2266,7 +2274,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, WT_MULTI *multi) * reconciliation, do it now. */ __wt_page_modify_clear(session, page); - __wt_ref_out(session, ref); + __wt_ref_out_int(session, ref, true); /* Swap the new page into place. */ ref->page = new->page; diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 06428b87f6e..0da0e0807bd 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -40,6 +40,8 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); + WT_STAT_SET(session, stats, cache_bytes_dirty, + __wt_btree_dirty_inuse(session)); WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); @@ -104,8 +106,7 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) switch (page->type) { case WT_PAGE_COL_FIX: WT_STAT_INCR(session, stats, btree_column_fix); - WT_STAT_INCRV( - session, stats, btree_entries, page->pg_fix_entries); + WT_STAT_INCRV(session, stats, btree_entries, page->entries); break; case WT_PAGE_COL_INT: WT_STAT_INCR(session, stats, btree_column_internal); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 7bf15baa67f..ead6ccc4ac0 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -78,6 +78,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; uint64_t oldest_id, saved_pinned_id; uint32_t flags; + bool timer; conn = S2C(session); btree = S2BT(session); @@ -88,7 +89,8 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; - if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) + timer = 
WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT); + if (timer) __wt_epoch(session, &start); switch (syncop) { @@ -186,9 +188,9 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * to grow significantly larger than the configured maximum * size. */ - F_SET(btree, WT_BTREE_NO_RECONCILE); + F_SET(btree, WT_BTREE_ALLOW_SPLITS); ret = __wt_evict_file_exclusive_on(session); - F_CLR(btree, WT_BTREE_NO_RECONCILE); + F_CLR(btree, WT_BTREE_ALLOW_SPLITS); WT_ERR(ret); __wt_evict_file_exclusive_off(session); @@ -242,7 +244,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) break; } - if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { + if (timer) { __wt_epoch(session, &end); __wt_verbose(session, WT_VERB_CHECKPOINT, "__sync_file WT_SYNC_%s wrote: %" PRIu64 diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 340f9bb6f0e..7475811adc5 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -216,13 +216,11 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, true)); - /* - * Ignore trees with no root page. - * Verify, then discard the checkpoint from the cache. - */ - if (root_addr_size != 0 && - (ret = __wt_btree_tree_open( - session, root_addr, root_addr_size)) == 0) { + /* Skip trees with no root page. */ + if (root_addr_size != 0) { + WT_ERR(__wt_btree_tree_open( + session, root_addr, root_addr_size)); + if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "Root: %s %s", __wt_addr_string(session, @@ -230,14 +228,38 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) __wt_page_type_string( btree->root.page->type))); + __wt_evict_file_exclusive_off(session); + + /* Verify the tree. */ WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); + /* + * We have an exclusive lock on the handle, but we're + * swapping root pages in-and-out of that handle, and + * there's a race with eviction entering the tree and + * seeing an invalid root page. 
Eviction must work on + * trees being verified (else we'd have to do our own + * eviction), lock eviction out whenever we're loading + * a new root page. This loops works because we are + * called with eviction locked out, so we release the + * lock at the top of the loop and re-acquire it here. + */ + WT_TRET(__wt_evict_file_exclusive_on(session)); WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); + + /* + * We've finished one checkpoint's verification (verification, + * then cache eviction and checkpoint unload): if any errors + * occurred, quit. Done this way because otherwise we'd need + * at least two more state variables on error, one to know if + * we need to discard the tree from the cache and one to know + * if we need to unload the checkpoint. + */ WT_ERR(ret); /* Display the tree shape. */ @@ -252,7 +274,7 @@ err: /* Inform the underlying block manager we're done. */ /* Discard the list of checkpoints. */ if (ckptbase != NULL) - __wt_meta_ckptlist_free(session, ckptbase); + __wt_meta_ckptlist_free(session, &ckptbase); /* Free allocated memory. */ __wt_scr_free(session, &vs->max_key); @@ -386,7 +408,7 @@ recno_chk: if (recno != vs->record_total + 1) } switch (page->type) { case WT_PAGE_COL_FIX: - vs->record_total += page->pg_fix_entries; + vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; @@ -614,7 +636,7 @@ __verify_row_leaf_key_order( * If a tree is empty (just created), it won't have keys; if there * are no keys, we're done. */ - if (page->pg_row_entries == 0) + if (page->entries == 0) return (0); /* @@ -624,7 +646,7 @@ __verify_row_leaf_key_order( */ if (vs->max_addr->size != 0) { WT_RET(__wt_row_leaf_key_copy( - session, page, page->pg_row_d, vs->tmp1)); + session, page, page->pg_row, vs->tmp1)); /* * Compare the key against the largest key we've seen so far. 
@@ -653,7 +675,7 @@ __verify_row_leaf_key_order( /* Update the largest key we've seen to the last key on this page. */ WT_RET(__wt_row_leaf_key_copy(session, page, - page->pg_row_d + (page->pg_row_entries - 1), vs->max_key)); + page->pg_row + (page->entries - 1), vs->max_key)); (void)__wt_page_addr_string(session, ref, vs->max_addr); return (0); diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 3a6fd8261ba..a4071c44aee 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -203,7 +203,8 @@ __verify_dsk_row( WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; - uint32_t cell_num, cell_type, i, key_cnt, prefix; + size_t prefix; + uint32_t cell_num, cell_type, i, key_cnt; uint8_t *end; int cmp; @@ -343,8 +344,9 @@ __verify_dsk_row( if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " - "compression count of %" PRIu32 ", larger than " - "the length of the previous key, %" WT_SIZET_FMT, + "compression count of %" WT_SIZET_FMT + ", larger than the length of the previous key, %" + WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 049700952ee..86484feb7c9 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -340,9 +340,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * - * We may be passed a pointer to btree->evict_page that we are clearing - * here. We check when discarding pages that we're not discarding that - * page, so this clear must be done before the page is released. + * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; @@ -350,16 +348,19 @@ __tree_walk_internal(WT_SESSION_IMPL *session, /* If no page is active, begin a walk from the start/end of the tree. 
*/ if (ref == NULL) { restart: /* - * We can reach here with a NULL or root reference; the release + * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); - couple = couple_orig = ref = &btree->root; - if (ref->page == NULL) - goto done; + /* + * We're not supposed to walk trees without root pages. As this + * has not always been the case, assert to debug that change. + */ + WT_ASSERT(session, btree->root.page != NULL); + couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index a7920da5267..9ccb9728189 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -115,9 +115,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, page, mod->mod_col_update, ins_headp, 1); ins_headp = &mod->mod_col_update[0]; } else { - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_col_update, ins_headp, - page->pg_var_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_col_update, ins_headp, page->entries); ins_headp = &mod->mod_col_update[cbt->slot]; } diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 64ee9e94f4c..c72d66f8796 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -240,8 +240,8 @@ leaf_only: cbt->compare = 1; return (0); } - if (recno >= current->ref_recno + page->pg_fix_entries) { - cbt->recno = current->ref_recno + page->pg_fix_entries; + if (recno >= current->ref_recno + page->entries) { + cbt->recno = current->ref_recno + page->entries; goto past_end; } else { cbt->recno = recno; @@ -257,8 +257,7 @@ leaf_only: } if ((cip = __col_var_search(current, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(current); - cbt->slot = page->pg_var_entries == 0 ? - 0 : page->pg_var_entries - 1; + cbt->slot = page->entries == 0 ? 
0 : page->entries - 1; goto past_end; } else { cbt->recno = recno; diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 99ee34a6c5d..032fdf7d897 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -26,7 +26,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) btree = S2BT(session); - if (page->pg_row_entries == 0) { /* Just checking... */ + if (page->entries == 0) { /* Just checking... */ F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS); return (0); } @@ -51,15 +51,15 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) */ WT_RET(__wt_scr_alloc(session, 0, &key)); WT_RET(__wt_scr_alloc(session, - (uint32_t)__bitstr_size(page->pg_row_entries), &tmp)); + (uint32_t)__bitstr_size(page->entries), &tmp)); memset(tmp->mem, 0, tmp->memsize); if ((gap = btree->key_gap) == 0) gap = 1; - __inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap); + __inmem_row_leaf_slots(tmp->mem, 0, page->entries, gap); /* Instantiate the keys. */ - for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i) + for (rip = page->pg_row, i = 0; i < page->entries; ++rip, ++i) if (__bit_test(tmp->mem, i)) WT_ERR(__wt_row_leaf_key_work( session, page, rip, key, true)); @@ -282,7 +282,7 @@ switch_and_jump: /* Switching to a forward roll. */ * the tracking cache. */ if (slot_offset == 0) { - __wt_readlock(session, btree->ovfl_lock); + __wt_readlock(session, &btree->ovfl_lock); copy = WT_ROW_KEY_COPY(rip); if (!__wt_row_leaf_key_info(page, copy, NULL, &cell, &keyb->data, &keyb->size)) { @@ -290,7 +290,7 @@ switch_and_jump: /* Switching to a forward roll. 
*/ ret = __wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, unpack, keyb); } - __wt_readunlock(session, btree->ovfl_lock); + __wt_readunlock(session, &btree->ovfl_lock); WT_ERR(ret); break; } diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index a1c214e5b8b..b1a81ca3d9f 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -85,9 +85,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (cbt->compare == 0) { if (cbt->ins == NULL) { /* Allocate an update array as necessary. */ - WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_row_update, - upd_entry, page->pg_row_entries); + WT_PAGE_ALLOC_AND_SWAP(session, page, + mod->mod_row_update, upd_entry, page->entries); /* Set the WT_UPDATE array reference. */ upd_entry = &mod->mod_row_update[cbt->slot]; @@ -147,10 +146,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, - mod->mod_row_insert, ins_headp, page->pg_row_entries + 1); + mod->mod_row_insert, ins_headp, page->entries + 1); ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? - page->pg_row_entries: cbt->slot; + page->entries: cbt->slot; ins_headp = &mod->mod_row_insert[ins_slot]; /* Allocate the WT_INSERT_HEAD structure as necessary. 
*/ diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index d4e82c458d4..9c3d467340e 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -486,14 +486,14 @@ leaf_only: if (insert && descend_right) { cbt->append_tree = 1; - if (page->pg_row_entries == 0) { - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + if (page->entries == 0) { + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, - page->pg_row_d + (page->pg_row_entries - 1)); + page->pg_row + (page->entries - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ -511,11 +511,11 @@ leaf_only: * doing the tests and error handling inside the loop costs about 5%. */ base = 0; - limit = page->pg_row_entries; + limit = page->entries; if (collator == NULL && srch_key->size <= WT_COMPARE_SHORT_MAXLEN) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -529,7 +529,7 @@ leaf_only: else if (collator == NULL) for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -547,7 +547,7 @@ leaf_only: else for (; limit != 0; limit >>= 1) { indx = base + (limit >> 1); - rip = page->pg_row_d + indx; + rip = page->pg_row + indx; WT_ERR( __wt_row_leaf_key(session, page, rip, item, true)); @@ -591,13 +591,13 @@ leaf_match: cbt->compare = 0; */ if (base == 0) { cbt->compare = 1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); + cbt->slot = WT_ROW_SLOT(page, page->pg_row); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; - cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); + cbt->slot = WT_ROW_SLOT(page, page->pg_row + (base - 1)); ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } @@ 
-623,215 +623,3 @@ leaf_match: cbt->compare = 0; err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } - -/* - * __wt_row_random_leaf -- - * Return a random key from a row-store leaf page. - */ -int -__wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_INSERT *ins, **start, **stop; - WT_INSERT_HEAD *ins_head; - WT_PAGE *page; - uint64_t samples; - uint32_t choice, entries, i; - int level; - - page = cbt->ref->page; - start = stop = NULL; /* [-Wconditional-uninitialized] */ - entries = 0; /* [-Wconditional-uninitialized] */ - - __cursor_pos_clear(cbt); - - /* If the page has disk-based entries, select from them. */ - if (page->pg_row_entries != 0) { - cbt->compare = 0; - cbt->slot = __wt_random(&session->rnd) % page->pg_row_entries; - - /* - * The real row-store search function builds the key, so we - * have to as well. - */ - return (__wt_row_leaf_key(session, - page, page->pg_row_d + cbt->slot, cbt->tmp, false)); - } - - /* - * If the tree is new (and not empty), it might have a large insert - * list. - * - * Walk down the list until we find a level with at least 50 entries, - * that's where we'll start rolling random numbers. The value 50 is - * used to ignore levels with only a few entries, that is, levels which - * are potentially badly skewed. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { - start = &ins_head->head[level]; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - - if (entries > 50) - break; - } - - /* - * If it's a tiny list and we went all the way to level 0, correct the - * level; entries is correctly set. - */ - if (level < 0) - level = 0; - - /* - * Step down the skip list levels, selecting a random chunk of the name - * space at each level. 
- */ - for (samples = entries; level > 0; samples += entries) { - /* - * There are (entries) or (entries + 1) chunks of the name space - * considered at each level. They are: between start and the 1st - * element, between the 1st and 2nd elements, and so on to the - * last chunk which is the name space after the stop element on - * the current level. This last chunk of name space may or may - * not be there: as we descend the levels of the skip list, this - * chunk may appear, depending if the next level down has - * entries logically after the stop point in the current level. - * We can't ignore those entries: because of the algorithm used - * to determine the depth of a skiplist, there may be a large - * number of entries "revealed" by descending a level. - * - * If the next level down has more items after the current stop - * point, there are (entries + 1) chunks to consider, else there - * are (entries) chunks. - */ - if (*(stop - 1) == NULL) - choice = __wt_random(&session->rnd) % entries; - else - choice = __wt_random(&session->rnd) % (entries + 1); - - if (choice == entries) { - /* - * We selected the name space after the stop element on - * this level. Set the start point to the current stop - * point, descend a level and move the stop element to - * the end of the list, that is, the end of the newly - * discovered name space, counting entries as we go. - */ - start = stop; - --start; - --level; - for (entries = 0, stop = start; - *stop != NULL; stop = &(*stop)->next[level]) - ++entries; - } else { - /* - * We selected another name space on the level. Move the - * start pointer the selected number of entries forward - * to the start of the selected chunk (if the selected - * number is 0, start won't move). Set the stop pointer - * to the next element in the list and drop both start - * and stop down a level. 
- */ - for (i = 0; i < choice; ++i) - start = &(*start)->next[level]; - stop = &(*start)->next[level]; - - --start; - --stop; - --level; - - /* Count the entries in the selected name space. */ - for (entries = 0, - ins = *start; ins != *stop; ins = ins->next[level]) - ++entries; - } - } - - /* - * When we reach the bottom level, entries will already be set. Select - * a random entry from the name space and return it. - * - * It should be impossible for the entries count to be 0 at this point, - * but check for it out of paranoia and to quiet static testing tools. - */ - if (entries > 0) - entries = __wt_random(&session->rnd) % entries; - for (ins = *start; entries > 0; --entries) - ins = ins->next[0]; - - cbt->ins = ins; - cbt->ins_head = ins_head; - cbt->compare = 0; - - /* - * Random lookups in newly created collections can be slow if a page - * consists of a large skiplist. Schedule the page for eviction if we - * encounter a large skiplist. This worthwhile because applications - * that take a sample often take many samples, so the overhead of - * traversing the skip list each time accumulates to real time. - */ - if (samples > 5000) - __wt_page_evict_soon(session, cbt->ref); - - return (0); -} - -/* - * __wt_row_random_descent -- - * Find a random leaf page in a row-store tree. - */ -int -__wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - WT_PAGE_INDEX *pindex; - WT_REF *current, *descent; - - btree = S2BT(session); - current = NULL; - - if (0) { -restart: /* - * Discard the currently held page and restart the search from - * the root. - */ - WT_RET(__wt_page_release(session, current, 0)); - } - - /* Search the internal pages of the tree. 
*/ - current = &btree->root; - for (;;) { - page = current->page; - if (page->type != WT_PAGE_ROW_INT) - break; - - WT_INTL_INDEX_GET(session, page, pindex); - descent = pindex->index[ - __wt_random(&session->rnd) % pindex->entries]; - - /* - * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search at the root. - * - * On other error, simply return, the swap call ensures we're - * holding nothing on failure. - */ - if ((ret = __wt_page_swap( - session, current, descent, WT_READ_RESTART_OK)) == 0) { - current = descent; - continue; - } - if (ret == WT_RESTART) - goto restart; - return (ret); - } - - cbt->ref = current; - return (0); -} diff --git a/src/checksum/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c index ddfa2bdaeb8..a9be9ced1c6 100644 --- a/src/checksum/power8/crc32_wrapper.c +++ b/src/checksum/power8/crc32_wrapper.c @@ -1,4 +1,6 @@ #if defined(__powerpc64__) +#include "wt_internal.h" + #define CRC_TABLE #include "crc32_constants.h" @@ -68,8 +70,6 @@ out: } #endif -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. diff --git a/src/checksum/zseries/crc32-s390x.c b/src/checksum/zseries/crc32-s390x.c index f77d6768d42..28b46594220 100644 --- a/src/checksum/zseries/crc32-s390x.c +++ b/src/checksum/zseries/crc32-s390x.c @@ -6,8 +6,20 @@ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> * */ +#include "wt_internal.h" + #include <sys/types.h> #include <endian.h> + +#if defined(HAVE_CRC32_HARDWARE) + +#include <sys/auxv.h> + +/* RHEL 7 has kernel support, but does not define this constant in the lib c headers. 
*/ +#ifndef HWCAP_S390_VX +#define HWCAP_S390_VX 2048 +#endif + #include "crc32-s390x.h" #include "slicing-consts.h" @@ -69,8 +81,6 @@ unsigned int __wt_crc32c_le(unsigned int crc, const unsigned char *buf, size_t l /* Main CRC-32 functions */ DEFINE_CRC32_VX(__wt_crc32c_le_vx, __wt_crc32c_le_vgfm_16, __wt_crc32c_le) -#include "wt_internal.h" - /* * __wt_checksum_hw -- * WiredTiger: return a checksum for a chunk of memory. @@ -81,6 +91,8 @@ __wt_checksum_hw(const void *chunk, size_t len) return (~__wt_crc32c_le_vx(0xffffffff, chunk, len)); } +#endif + /* * __wt_checksum_init -- * WiredTiger: detect CRC hardware and set the checksum function. @@ -89,8 +101,14 @@ void __wt_checksum_init(void) { #if defined(HAVE_CRC32_HARDWARE) - __wt_process.checksum = __wt_checksum_hw; -#else + unsigned long caps = getauxval(AT_HWCAP); + + if (caps & HWCAP_S390_VX) + __wt_process.checksum = __wt_checksum_hw; + else + __wt_process.checksum = __wt_checksum_sw; + +#else /* !HAVE_CRC32_HARDWARE */ __wt_process.checksum = __wt_checksum_sw; #endif } diff --git a/src/config/config_api.c b/src/config/config_api.c index 05c5c1287a7..c1299baaafe 100644 --- a/src/config/config_api.c +++ b/src/config/config_api.c @@ -215,7 +215,7 @@ __wt_configure_method(WT_SESSION_IMPL *session, WT_CONFIG_ENTRY *entry; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - size_t cnt; + size_t cnt, len; char *newcheck_name, *p; /* @@ -276,12 +276,10 @@ __wt_configure_method(WT_SESSION_IMPL *session, */ WT_ERR(__wt_calloc_one(session, &entry)); entry->method = (*epp)->method; - WT_ERR(__wt_calloc_def(session, - strlen((*epp)->base) + strlen(",") + strlen(config) + 1, &p)); - (void)strcpy(p, (*epp)->base); - (void)strcat(p, ","); - (void)strcat(p, config); + len = strlen((*epp)->base) + strlen(",") + strlen(config) + 1; + WT_ERR(__wt_calloc_def(session, len, &p)); entry->base = p; + WT_ERR(__wt_snprintf(p, len, "%s,%s", (*epp)->base, config)); /* * There may be a default value in the config argument passed in (for diff --git 
a/src/config/config_def.c b/src/config/config_def.c index e4fd7937a40..f152fbacad4 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -147,11 +147,12 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { confchk_WT_CONNECTION_reconfigure_statistics_log_subconfigs, 5 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -293,7 +294,7 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "source", "string", NULL, NULL, NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "type", "string", NULL, NULL, NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, @@ -465,7 +466,7 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, 
NULL, 0 }, @@ -529,7 +530,7 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -613,7 +614,7 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, - { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "split_pct", "int", NULL, "min=50,max=100", NULL, 0 }, { "value_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, @@ -749,11 +750,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -835,11 +837,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "use_environment_priv", "boolean", NULL, NULL, NULL, 0 }, { 
"verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -916,11 +919,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "version", "string", NULL, NULL, NULL, 0 }, { "write_through", "list", @@ -997,11 +1001,12 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, { "verbose", "list", 
NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," - "\"evict\",\"evictserver\",\"fileops\",\"handleops\",\"log\"," - "\"lsm\",\"lsm_manager\",\"metadata\",\"mutex\",\"overflow\"," - "\"read\",\"rebalance\",\"reconcile\",\"recovery\",\"salvage\"," - "\"shared_cache\",\"split\",\"temporary\",\"thread_group\"," - "\"transaction\",\"verify\",\"version\",\"write\"]", + "\"evict\",\"evict_stuck\",\"evictserver\",\"fileops\"," + "\"handleops\",\"log\",\"lsm\",\"lsm_manager\",\"metadata\"," + "\"mutex\",\"overflow\",\"read\",\"rebalance\",\"reconcile\"," + "\"recovery\",\"recovery_progress\",\"salvage\",\"shared_cache\"," + "\"split\",\"temporary\",\"thread_group\",\"transaction\"," + "\"verify\",\"version\",\"write\"]", NULL, 0 }, { "write_through", "list", NULL, "choices=[\"data\",\"log\"]", @@ -1050,7 +1055,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { { "WT_CONNECTION.reconfigure", "async=(enabled=false,ops_max=1024,threads=2),cache_overhead=8," "cache_size=100MB,checkpoint=(log_size=0,wait=0),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," @@ -1114,7 +1119,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", + "split_deepen_per_child=0,split_pct=90,type=file,value_format=u", confchk_WT_SESSION_create, 42 }, { "WT_SESSION.drop", @@ -1208,7 +1213,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," 
"prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_file_config, 35 }, { "file.meta", @@ -1223,7 +1228,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," "log=(enabled=true),memory_page_max=5MB,os_cache_dirty_max=0," "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" - ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=90," "value_format=u,version=(major=0,minor=0)", confchk_file_meta, 39 }, @@ -1248,7 +1253,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "merge_min=0),memory_page_max=5MB,old_chunks=," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", + "split_deepen_per_child=0,split_pct=90,value_format=u", confchk_lsm_meta, 39 }, { "table.meta", @@ -1261,7 +1266,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," "threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1285,7 +1290,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true," "config_base=true,create=false,direct_io=,encryption=(keyid=," - "name=,secretkey=),error_prefix=,eviction=(threads_max=1," + "name=,secretkey=),error_prefix=,eviction=(threads_max=8," 
"threads_min=1),eviction_checkpoint_target=5," "eviction_dirty_target=5,eviction_dirty_trigger=20," "eviction_target=80,eviction_trigger=95,exclusive=false," @@ -1309,7 +1314,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" @@ -1330,7 +1335,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",builtin_extension_config=,cache_overhead=8,cache_size=100MB," "checkpoint=(log_size=0,wait=0),checkpoint_sync=true,direct_io=," "encryption=(keyid=,name=,secretkey=),error_prefix=," - "eviction=(threads_max=1,threads_min=1)," + "eviction=(threads_max=8,threads_min=1)," "eviction_checkpoint_target=5,eviction_dirty_target=5," "eviction_dirty_trigger=20,eviction_target=80,eviction_trigger=95" ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 474b8bbad8a..68d45678965 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1662,8 +1662,8 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_MSG(session, EINVAL, "Creating a new database is incompatible with " "read-only configuration"); - len = (size_t)snprintf(buf, sizeof(buf), - "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING); + WT_ERR(__wt_snprintf_len_set(buf, sizeof(buf), &len, + "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING)); WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf)); WT_ERR(__wt_fsync(session, fh, true)); } else { @@ -1798,6 +1798,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "checkpoint", WT_VERB_CHECKPOINT }, { 
"compact", WT_VERB_COMPACT }, { "evict", WT_VERB_EVICT }, + { "evict_stuck", WT_VERB_EVICT_STUCK }, { "evictserver", WT_VERB_EVICTSERVER }, { "fileops", WT_VERB_FILEOPS }, { "handleops", WT_VERB_HANDLEOPS }, @@ -1811,6 +1812,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "rebalance", WT_VERB_REBALANCE }, { "reconcile", WT_VERB_RECONCILE }, { "recovery", WT_VERB_RECOVERY }, + { "recovery_progress", WT_VERB_RECOVERY_PROGRESS }, { "salvage", WT_VERB_SALVAGE }, { "shared_cache", WT_VERB_SHARED_CACHE }, { "split", WT_VERB_SPLIT }, @@ -1986,6 +1988,16 @@ __conn_set_file_system( CONNECTION_API_CALL(conn, session, set_file_system, config, cfg); WT_UNUSED(cfg); + /* + * You can only configure a file system once, and attempting to do it + * again probably means the extension argument didn't have early-load + * set and we've already configured the default file system. + */ + if (conn->file_system != NULL) + WT_ERR_MSG(session, EPERM, + "filesystem already configured; custom filesystems should " + "enable \"early_load\" configuration"); + conn->file_system = file_system; err: API_END_RET(session, ret); @@ -2174,6 +2186,15 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, if (cval.val) F_SET(conn, WT_CONN_READONLY); + /* Configure error messages so we get them right early. */ + WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); + if (cval.len != 0) + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &conn->error_prefix)); + + /* Set the database home so extensions have access to it. */ + WT_ERR(__conn_home(session, home, cfg)); + /* * Load early extensions before doing further initialization (one early * extension is to configure a file system). @@ -2197,6 +2218,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR( __conn_chk_file_system(session, F_ISSET(conn, WT_CONN_READONLY))); + /* Make sure no other thread of control already owns this database. 
*/ + WT_ERR(__conn_single(session, cfg)); + /* * Capture the config_base setting file for later use. Again, if the * application doesn't want us to read the base configuration file, @@ -2206,18 +2230,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval)); config_base_set = cval.val != 0; - /* Configure error messages so we get them right early. */ - WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval)); - if (cval.len != 0) - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &conn->error_prefix)); - - /* Get the database home. */ - WT_ERR(__conn_home(session, home, cfg)); - - /* Make sure no other thread of control already owns this database. */ - WT_ERR(__conn_single(session, cfg)); - /* * Build the real configuration stack, in the following order (where * later entries override earlier entries): @@ -2238,10 +2250,9 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_scr_alloc(session, 0, &i3)); cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open_all); cfg[1] = NULL; - WT_ERR_TEST(snprintf(version, sizeof(version), + WT_ERR(__wt_snprintf(version, sizeof(version), "version=(major=%d,minor=%d)", - WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) >= - (int)sizeof(version), ENOMEM); + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR)); __conn_config_append(cfg, version); /* Ignore the base_config file if config_base_set is false. 
*/ diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index fe5f94ea03d..28dd06332e0 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -143,7 +143,8 @@ __wt_cache_config(WT_SESSION_IMPL *session, bool reconfigure, const char *cfg[]) if (reconfigure) WT_RET(__wt_thread_group_resize( session, &conn->evict_threads, - conn->evict_threads_min, conn->evict_threads_max, + conn->evict_threads_min, + conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL)); return (0); @@ -186,8 +187,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_RET(__wt_cond_auto_alloc(session, "cache eviction server", - false, 10000, WT_MILLION, &cache->evict_cond)); + WT_RET(__wt_cond_auto_alloc(session, + "cache eviction server", 10000, WT_MILLION, &cache->evict_cond)); WT_RET(__wt_spin_init(session, &cache->evict_pass_lock, "evict pass")); WT_RET(__wt_spin_init(session, &cache->evict_queue_lock, "cache eviction queue")); @@ -311,7 +312,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) cache->bytes_dirty_intl + cache->bytes_dirty_leaf, cache->pages_dirty_intl + cache->pages_dirty_leaf); - WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); __wt_spin_destroy(session, &cache->evict_pass_lock); __wt_spin_destroy(session, &cache->evict_queue_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 79c2fc23da5..ed078991581 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -32,7 +32,7 @@ */ #define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 #define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 -#define WT_CACHE_POOL_READ_MULTIPLIER 1 +#define WT_CACHE_POOL_READ_MULTIPLIER 1 static void __cache_pool_adjust( WT_SESSION_IMPL *, uint64_t, uint64_t, bool, bool *); @@ -104,8 +104,8 @@ 
__wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) TAILQ_INIT(&cp->cache_pool_qh); WT_ERR(__wt_spin_init( session, &cp->cache_pool_lock, "cache shared pool")); - WT_ERR(__wt_cond_alloc(session, - "cache pool server", false, &cp->cache_pool_cond)); + WT_ERR(__wt_cond_alloc( + session, "cache pool server", &cp->cache_pool_cond)); __wt_process.cache_pool = cp; __wt_verbose(session, @@ -418,8 +418,9 @@ static void __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) { WT_CACHE_POOL *cp; - bool adjusted; uint64_t bump_threshold, highest; + int i; + bool adjusted; cp = __wt_process.cache_pool; adjusted = false; @@ -438,11 +439,17 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) /* * Actively attempt to: - * - Reduce the amount allocated, if we are over the budget + * - Reduce the amount allocated, if we are over the budget. * - Increase the amount used if there is capacity and any pressure. + * Don't keep trying indefinitely, if we aren't succeeding in reducing + * the cache in use re-assessing the participants' states is necessary. + * We are also holding a lock across this process, which can slow + * participant shutdown if we spend a long time balancing. 
*/ - while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && - F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { + for (i = 0; + i < 2 * WT_CACHE_POOL_BUMP_THRESHOLD && + F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN); i++) { __cache_pool_adjust( session, highest, bump_threshold, forward, &adjusted); /* @@ -565,7 +572,7 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *entry; uint64_t adjustment, highest_percentile, pressure, reserved, smallest; u_int pct_full; - bool busy, pool_full, grow; + bool busy, decrease_ok, grow, pool_full; *adjustedp = false; cp = __wt_process.cache_pool; @@ -612,6 +619,34 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, continue; /* + * The bump threshold decreases as we try longer to balance + * the pool. Adjust how aggressively we free space from + * participants depending on how long we have been trying. + */ + decrease_ok = false; + /* + * Any participant is a candidate if we have been trying + * for long enough. + */ + if (bump_threshold == 0) + decrease_ok = true; + /* + * Participants that aren't doing application eviction and + * are showing a reasonable amount of usage are excluded + * even if we have been trying for a while. + */ + else if (bump_threshold < WT_CACHE_POOL_BUMP_THRESHOLD / 3 && + (!busy && highest > 1)) + decrease_ok = true; + /* + * Any participant that is proportionally less busy is a + * candidate from the first attempt. + */ + else if (highest > 1 && + pressure < WT_CACHE_POOL_REDUCE_THRESHOLD) + decrease_ok = true; + + /* * If the entry is currently allocated less than the reserved * size, increase its allocation. 
This should only happen if: * - it's the first time we've seen this member, or @@ -624,17 +659,12 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * Conditions for reducing the amount of resources for an * entry: * - the pool is full, - * - application threads are not busy doing eviction already, * - this entry has more than the minimum amount of space in * use, - * - the read pressure in this entry is below the threshold, - * other entries need more cache, the entry has more than - * the minimum space and there is no available space in the - * pool. + * - it was determined that this slot is a good candidate */ - } else if (pool_full && !busy && - entry->cache_size > reserved && - pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) { + } else if (pool_full && + entry->cache_size > reserved && decrease_ok) { grow = false; /* * Don't drop the size down too much - or it can @@ -733,7 +763,7 @@ __wt_cache_pool_server(void *arg) F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) __wt_cond_wait( - session, cp->cache_pool_cond, WT_MILLION); + session, cp->cache_pool_cond, WT_MILLION, NULL); /* * Re-check pool run flag - since we want to avoid getting the diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 1d18c128c5b..d5a6faf7bd7 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -65,6 +65,16 @@ __ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp) } /* + * __ckpt_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__ckpt_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_CHECKPOINT)); +} + +/* * __ckpt_server -- * The checkpoint server thread. */ @@ -80,14 +90,18 @@ __ckpt_server(void *arg) conn = S2C(session); wt_session = (WT_SESSION *)session; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_CHECKPOINT)) { + for (;;) { /* * Wait... 
* NOTE: If the user only configured logsize, then usecs * will be 0 and this wait won't return until signalled. */ - __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); + __wt_cond_wait(session, + conn->ckpt_cond, conn->ckpt_usecs, __ckpt_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__ckpt_server_run_chk(session)) + break; /* * Checkpoint the database if the connection is marked dirty. @@ -115,7 +129,8 @@ __ckpt_server(void *arg) * it so we don't do another checkpoint * immediately. */ - __wt_cond_wait(session, conn->ckpt_cond, 1); + __wt_cond_wait( + session, conn->ckpt_cond, 1, NULL); } } else WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); @@ -154,8 +169,7 @@ __ckpt_server_start(WT_CONNECTION_IMPL *conn) "checkpoint-server", true, session_flags, &conn->ckpt_session)); session = conn->ckpt_session; - WT_RET(__wt_cond_alloc( - session, "checkpoint server", false, &conn->ckpt_cond)); + WT_RET(__wt_cond_alloc(session, "checkpoint server", &conn->ckpt_cond)); /* * Start the thread. diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index e9e3925c57e..657cdebf7ee 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -12,37 +12,47 @@ * __conn_dhandle_destroy -- * Destroy a data handle. */ -static void +static int __conn_dhandle_destroy(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) { + WT_DECL_RET; + + WT_WITH_DHANDLE(session, dhandle, ret = __wt_btree_discard(session)); + __wt_rwlock_destroy(session, &dhandle->rwlock); __wt_free(session, dhandle->name); __wt_free(session, dhandle->checkpoint); - __wt_free(session, dhandle->handle); __wt_spin_destroy(session, &dhandle->close_lock); __wt_stat_dsrc_discard(session, dhandle); __wt_overwrite_and_free(session, dhandle); + return (ret); } /* - * __conn_dhandle_alloc -- + * __wt_conn_dhandle_alloc -- * Allocate a new data handle and return it linked into the connection's * list. 
*/ -static int -__conn_dhandle_alloc(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, WT_DATA_HANDLE **dhandlep) +int +__wt_conn_dhandle_alloc( + WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; uint64_t bucket; - *dhandlep = NULL; + /* + * Ensure no one beat us to creating the handle now that we hold the + * write lock. + */ + if ((ret = + __wt_conn_dhandle_find(session, uri, checkpoint)) != WT_NOTFOUND) + return (ret); WT_RET(__wt_calloc_one(session, &dhandle)); - WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); + __wt_rwlock_init(session, &dhandle->rwlock); dhandle->name_hash = __wt_hash_city64(uri, strlen(uri)); WT_ERR(__wt_strdup(session, uri, &dhandle->name)); WT_ERR(__wt_strdup(session, checkpoint, &dhandle->checkpoint)); @@ -75,10 +85,10 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(S2C(session), dhandle, bucket); - *dhandlep = dhandle; + session->dhandle = dhandle; return (0); -err: __conn_dhandle_destroy(session, dhandle); +err: WT_TRET(__conn_dhandle_destroy(session, dhandle)); return (ret); } @@ -122,10 +132,7 @@ __wt_conn_dhandle_find( } } - WT_RET(__conn_dhandle_alloc(session, uri, checkpoint, &dhandle)); - - session->dhandle = dhandle; - return (0); + return (WT_NOTFOUND); } /* @@ -153,11 +160,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_RET(__wt_evict_file_exclusive_on(session)); /* - * If we don't already have the schema lock, make it an error to try - * to acquire it. The problem is that we are holding an exclusive - * lock on the handle, and if we attempt to acquire the schema lock - * we might deadlock with a thread that has the schema lock and wants - * a handle lock (specifically, checkpoint). + * If we don't already have the schema lock, make it an error to try to + * acquire it. 
The problem is that we are holding an exclusive lock on + * the handle, and if we attempt to acquire the schema lock we might + * deadlock with a thread that has the schema lock and wants a handle + * lock. */ no_schema_lock = false; if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { @@ -197,6 +204,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) } WT_TRET(__wt_btree_close(session)); + F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); /* * If we marked a handle dead it will be closed by sweep, via @@ -306,7 +314,8 @@ __wt_conn_btree_open( F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !LF_ISSET(WT_DHANDLE_LOCK_ONLY)); - WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_CLOSING)); + WT_ASSERT(session, + !F_ISSET(S2C(session), WT_CONN_CLOSING_NO_MORE_OPENS)); /* * If the handle is already open, it has to be closed so it can be @@ -400,10 +409,7 @@ __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, return (ret == EBUSY ? 0 : ret); WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock(session, false)); - else - WT_TRET(__wt_session_release_btree(session)); + WT_TRET(__wt_session_release_btree(session)); return (ret); } @@ -419,12 +425,11 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t bucket; conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - /* * If we're given a URI, then we walk only the hash list for that * name. If we don't have a URI we walk the entire dhandle list. 
@@ -432,29 +437,42 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, + &conn->dhhash[bucket], hashq)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || strcmp(uri, dhandle->name) != 0) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + return (0); + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || F_ISSET(dhandle, WT_DHANDLE_DEAD) || dhandle->checkpoint != NULL || !WT_PREFIX_MATCH(dhandle->name, "file:") || WT_IS_METADATA(dhandle)) continue; - WT_RET(__conn_btree_apply_internal( - session, dhandle, file_func, name_func, cfg)); + WT_ERR(__conn_btree_apply_internal(session, + dhandle, file_func, name_func, cfg)); } } - return (0); +err: WT_DHANDLE_RELEASE(dhandle); + return (ret); } /* @@ -473,7 +491,8 @@ __wt_conn_dhandle_close_all( conn = S2C(session); - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, session->dhandle == NULL); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; @@ -484,7 +503,12 @@ __wt_conn_dhandle_close_all( session->dhandle = dhandle; - /* Lock the handle exclusively. */ + /* + * Lock the handle exclusively. 
If this is part of + * schema-changing operation (indicated by metadata tracking + * being enabled), hold the lock for the duration of the + * operation. + */ WT_ERR(__wt_session_get_btree(session, dhandle->name, dhandle->checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); @@ -534,7 +558,8 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, bool final) dhandle = session->dhandle; bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. */ @@ -583,7 +608,7 @@ __wt_conn_dhandle_discard_single( } /* Try to remove the handle, protected by the data handle lock. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __conn_dhandle_remove(session, final)); if (set_pass_intr) (void)__wt_atomic_subv32(&S2C(session)->cache->pass_intr, 1); @@ -594,7 +619,7 @@ __wt_conn_dhandle_discard_single( */ if (ret == 0 || final) { __conn_btree_config_clear(session); - __conn_dhandle_destroy(session, dhandle); + WT_TRET(__conn_dhandle_destroy(session, dhandle)); session->dhandle = NULL; } diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 02182daa7dc..287e9ca7b99 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -53,19 +53,18 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) /* Spinlocks. 
*/ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); WT_SPIN_INIT_TRACKED(session, &conn->checkpoint_lock, checkpoint); - WT_SPIN_INIT_TRACKED(session, &conn->dhandle_lock, handle_list); WT_RET(__wt_spin_init(session, &conn->encryptor_lock, "encryptor")); WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_SPIN_INIT_TRACKED(session, &conn->metadata_lock, metadata); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_SPIN_INIT_TRACKED(session, &conn->schema_lock, schema); - WT_SPIN_INIT_TRACKED(session, &conn->table_lock, table); WT_RET(__wt_spin_init(session, &conn->turtle_lock, "turtle file")); /* Read-write locks */ - WT_RET(__wt_rwlock_alloc( - session, &conn->hot_backup_lock, "hot backup")); + __wt_rwlock_init(session, &conn->dhandle_lock); + __wt_rwlock_init(session, &conn->hot_backup_lock); + __wt_rwlock_init(session, &conn->table_lock); WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); for (i = 0; i < WT_PAGE_LOCKS; ++i) @@ -80,7 +79,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init( session, &conn->lsm_manager.switch_lock, "LSM switch queue lock")); WT_RET(__wt_cond_alloc( - session, "LSM worker cond", false, &conn->lsm_manager.work_cond)); + session, "LSM worker cond", &conn->lsm_manager.work_cond)); /* * Generation numbers. @@ -110,16 +109,15 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) * __wt_connection_destroy -- * Destroy the connection's underlying WT_CONNECTION_IMPL structure. */ -int +void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) { - WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; /* Check there's something to destroy. 
*/ if (conn == NULL) - return (0); + return; session = conn->default_session; @@ -136,7 +134,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->api_lock); __wt_spin_destroy(session, &conn->block_lock); __wt_spin_destroy(session, &conn->checkpoint_lock); - __wt_spin_destroy(session, &conn->dhandle_lock); + __wt_rwlock_destroy(session, &conn->dhandle_lock); __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); __wt_rwlock_destroy(session, &conn->hot_backup_lock); @@ -144,17 +142,12 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); - __wt_spin_destroy(session, &conn->table_lock); + __wt_rwlock_destroy(session, &conn->table_lock); __wt_spin_destroy(session, &conn->turtle_lock); for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); - /* Destroy the file-system configuration. */ - if (conn->file_system != NULL && conn->file_system->terminate != NULL) - WT_TRET(conn->file_system->terminate( - conn->file_system, (WT_SESSION *)session)); - /* Free allocated memory. 
*/ __wt_free(session, conn->cfg); __wt_free(session, conn->home); @@ -163,5 +156,4 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_stat_connection_discard(session, conn); __wt_free(NULL, conn); - return (ret); } diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 8198b3a1a02..b8b5bd2a908 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -174,7 +174,7 @@ __logmgr_config( WT_RET(__logmgr_sync_cfg(session, cfg)); if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); return (0); } @@ -237,7 +237,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) * We can only archive files if a hot backup is not in progress or * if we are the backup. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); locked = true; if (!conn->hot_backup || backup_file != 0) { for (i = 0; i < logcount; i++) { @@ -248,7 +248,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) session, WT_LOG_FILENAME, lognum)); } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); locked = false; /* @@ -260,7 +260,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } @@ -341,7 +341,7 @@ __wt_log_truncate_files( conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && + if (F_ISSET(conn, WT_CONN_SERVER_LOG) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) WT_RET_MSG(session, EINVAL, "Attempt to archive manually while a server is running"); @@ -355,9 +355,9 @@ __wt_log_truncate_files( __wt_verbose(session, WT_VERB_LOG, 
"log_truncate_files: Archive once up to %" PRIu32, backup_file); - __wt_writelock(session, log->log_archive_lock); + __wt_writelock(session, &log->log_archive_lock); ret = __log_archive_once(session, backup_file); - __wt_writeunlock(session, log->log_archive_lock); + __wt_writeunlock(session, &log->log_archive_lock); return (ret); } @@ -382,7 +382,7 @@ __log_file_server(void *arg) conn = S2C(session); log = conn->log; locked = false; - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. @@ -433,7 +433,7 @@ __log_file_server(void *arg) */ if (!conn->hot_backup) { __wt_readlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); if (!conn->hot_backup) WT_ERR_ERROR_OK( __wt_ftruncate(session, @@ -441,7 +441,7 @@ __log_file_server(void *arg) close_end_lsn.l.offset), ENOTSUP); __wt_readunlock( - session, conn->hot_backup_lock); + session, &conn->hot_backup_lock); } WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0); @@ -505,8 +505,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - __wt_cond_auto_signal( - session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); /* * We do not want to wait potentially a second * to process this. Yield to give the wrlsn @@ -517,8 +516,9 @@ __log_file_server(void *arg) continue; } } + /* Wait until the next event. */ - __wt_cond_wait(session, conn->log_file_cond, WT_MILLION / 10); + __wt_cond_wait(session, conn->log_file_cond, 100000, NULL); } if (0) { @@ -708,7 +708,7 @@ __log_wrlsn_server(void *arg) log = conn->log; yield = 0; WT_INIT_LSN(&prev); - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * Write out any log record buffers if anything was done * since last time. 
Only call the function to walk the @@ -730,12 +730,8 @@ __log_wrlsn_server(void *arg) if (yield++ < WT_THOUSAND) __wt_yield(); else - /* - * Send in false because if we did any work we would - * not be on this path. - */ __wt_cond_auto_wait( - session, conn->log_wrlsn_cond, did_work); + session, conn->log_wrlsn_cond, did_work, NULL); } /* * On close we need to do this one more time because there could @@ -787,7 +783,7 @@ __log_server(void *arg) * takes to sync out an earlier file. */ did_work = true; - while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LOG)) { /* * Slots depend on future activity. Force out buffered * writes in case we are idle. This cannot be part of the @@ -814,10 +810,11 @@ __log_server(void *arg) * agreed not to rename or remove any files in * the database directory. */ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (!conn->hot_backup) ret = __log_prealloc_once(session); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock( + session, &conn->hot_backup_lock); WT_ERR(ret); } @@ -826,10 +823,10 @@ __log_server(void *arg) */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { if (__wt_try_writelock( - session, log->log_archive_lock) == 0) { + session, &log->log_archive_lock) == 0) { ret = __log_archive_once(session, 0); __wt_writeunlock( - session, log->log_archive_lock); + session, &log->log_archive_lock); WT_ERR(ret); } else __wt_verbose(session, WT_VERB_LOG, @@ -839,10 +836,9 @@ __log_server(void *arg) } /* Wait until the next event. 
*/ - __wt_epoch(session, &start); - __wt_cond_auto_wait_signal(session, - conn->log_cond, did_work, &signalled); + __wt_cond_auto_wait_signal( + session, conn->log_cond, did_work, NULL, &signalled); __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } @@ -884,8 +880,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN")); - WT_RET(__wt_rwlock_alloc(session, - &log->log_archive_lock, "log archive lock")); + __wt_rwlock_init(session, &log->log_archive_lock); if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) log->allocsize = (uint32_t) WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN); @@ -904,10 +899,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_INIT_LSN(&log->write_lsn); WT_INIT_LSN(&log->write_start_lsn); log->fileid = 0; - WT_RET(__wt_cond_alloc( - session, "log sync", false, &log->log_sync_cond)); - WT_RET(__wt_cond_alloc( - session, "log write", false, &log->log_write_cond)); + WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond)); + WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond)); WT_RET(__wt_log_open(session)); WT_RET(__wt_log_slot_init(session)); @@ -930,6 +923,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); + F_SET(conn, WT_CONN_SERVER_LOG); + /* * Start the log close thread. It is not configurable. * If logging is enabled, this thread runs. 
@@ -937,8 +932,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) session_flags = WT_SESSION_NO_DATA_HANDLES; WT_RET(__wt_open_internal_session(conn, "log-close-server", false, session_flags, &conn->log_file_session)); - WT_RET(__wt_cond_alloc(conn->log_file_session, - "log close server", false, &conn->log_file_cond)); + WT_RET(__wt_cond_alloc( + conn->log_file_session, "log close server", &conn->log_file_cond)); /* * Start the log file close thread. @@ -954,8 +949,7 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, - "log write lsn server", false, 10000, WT_MILLION, - &conn->log_wrlsn_cond)); + "log write lsn server", 10000, WT_MILLION, &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -969,13 +963,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); WT_RET(__wt_cond_auto_alloc(conn->log_session, - "log server", false, 50000, WT_MILLION, &conn->log_cond)); + "log server", 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. 
@@ -1001,6 +995,8 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn = S2C(session); + F_CLR(conn, WT_CONN_SERVER_LOG); + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* * We always set up the log_path so printlog can work without @@ -1011,7 +1007,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -1026,7 +1022,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -1047,9 +1043,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index d4ace127bb2..eb3c79422a0 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -21,12 +21,6 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) session = conn->default_session; WT_ASSERT(session, session->iface.connection == &conn->iface); - /* - * Tell internal server threads to run: this must be set before opening - * any sessions. - */ - F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN); - /* WT_SESSION_IMPL array. 
*/ WT_RET(__wt_calloc(session, conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions)); @@ -100,8 +94,16 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_yield(); } - /* Clear any pending async ops. */ + /* Shut down the subsystems, ensuring workers see the state change. */ + F_SET(conn, WT_CONN_CLOSING); + WT_FULL_BARRIER(); + + /* + * Clear any pending async operations and shut down the async worker + * threads and system before closing LSM. + */ WT_TRET(__wt_async_flush(session)); + WT_TRET(__wt_async_destroy(session)); /* * Shut down server threads other than the eviction server, which is @@ -109,15 +111,20 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * btree handles, so take care in ordering shutdown to make sure they * exit before files are closed. */ - F_CLR(conn, WT_CONN_SERVER_RUN); - WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); - WT_TRET(__wt_sweep_destroy(session)); - F_SET(conn, WT_CONN_CLOSING); + /* + * Once the async and LSM threads exit, we shouldn't be opening any + * more files. + */ + F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); + WT_FULL_BARRIER(); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); + WT_TRET(__wt_sweep_destroy(session)); + + /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); /* Shut down the lookaside table, after all eviction is complete. */ @@ -126,7 +133,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); - /* Shut down metadata tracking, required before creating tables. */ + /* Shut down metadata tracking. 
*/ WT_TRET(__wt_meta_track_destroy(session)); /* @@ -140,7 +147,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); - F_CLR(conn, WT_CONN_LOG_SERVER_RUN); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. */ @@ -159,15 +165,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* Discard transaction state. */ __wt_txn_global_destroy(session); - /* Close extensions, first calling any unload entry point. */ - while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { - TAILQ_REMOVE(&conn->dlhqh, dlh, q); - - if (dlh->terminate != NULL) - WT_TRET(dlh->terminate(wt_conn)); - WT_TRET(__wt_dlclose(session, dlh)); - } - /* Close the lock file, opening up the database to other connections. */ if (conn->lock_fh != NULL) WT_TRET(__wt_close(session, &conn->lock_fh)); @@ -199,8 +196,22 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) __wt_free(session, s->hazard); } + /* Destroy the file-system configuration. */ + if (conn->file_system != NULL && conn->file_system->terminate != NULL) + WT_TRET(conn->file_system->terminate( + conn->file_system, (WT_SESSION *)session)); + + /* Close extensions, first calling any unload entry point. */ + while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { + TAILQ_REMOVE(&conn->dlhqh, dlh, q); + + if (dlh->terminate != NULL) + WT_TRET(dlh->terminate(wt_conn)); + WT_TRET(__wt_dlclose(session, dlh)); + } + /* Destroy the handle. 
*/ - WT_TRET(__wt_connection_destroy(conn)); + __wt_connection_destroy(conn); return (ret); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 3bcdfd7ecb1..d89392b66c6 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -409,7 +409,6 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) struct timespec ts; struct tm *tm, _tm; WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_FSTREAM *log_stream; conn = S2C(session); @@ -446,12 +445,9 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) * Lock the schema and walk the list of open handles, dumping * any that match the list of object sources. */ - if (conn->stat_sources != NULL) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply( + if (conn->stat_sources != NULL) + WT_RET(__wt_conn_btree_apply( session, NULL, __statlog_apply, NULL, NULL)); - WT_RET(ret); - } /* * Walk the list of open LSM trees, dumping any that match the @@ -485,8 +481,7 @@ __statlog_on_close(WT_SESSION_IMPL *session) if (!FLD_ISSET(conn->stat_flags, WT_STAT_ON_CLOSE)) return (0); - if (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) + if (F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) WT_RET_MSG(session, EINVAL, "Attempt to log statistics while a server is running"); @@ -498,6 +493,16 @@ err: __wt_scr_free(session, &tmp); } /* + * __statlog_server_run_chk -- + * Check to decide if the statistics log server should continue running. + */ +static bool +__statlog_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_STATISTICS)); +} + +/* * __statlog_server -- * The statistics server thread. 
*/ @@ -525,10 +530,14 @@ __statlog_server(void *arg) WT_ERR(__wt_buf_init(session, &path, strlen(conn->stat_path) + 128)); WT_ERR(__wt_buf_init(session, &tmp, strlen(conn->stat_path) + 128)); - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_STATISTICS)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, conn->stat_cond, conn->stat_usecs); + __wt_cond_wait(session, conn->stat_cond, + conn->stat_usecs, __statlog_server_run_chk); + + /* Check if we're quitting or being reconfigured. */ + if (!__statlog_server_run_chk(session)) + break; if (WT_STAT_ENABLED(session)) WT_ERR(__statlog_log_one(session, &path, &tmp)); @@ -563,7 +572,7 @@ __statlog_start(WT_CONNECTION_IMPL *conn) session = conn->stat_session; WT_RET(__wt_cond_alloc( - session, "statistics log server", false, &conn->stat_cond)); + session, "statistics log server", &conn->stat_cond)); /* * Start the thread. diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index d1254d8afcc..22d90b08438 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -10,7 +10,7 @@ #define WT_DHANDLE_CAN_DISCARD(dhandle) \ (!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) && \ - dhandle->session_inuse == 0 && dhandle->session_ref == 0) + (dhandle)->session_inuse == 0 && (dhandle)->session_ref == 0) /* * __sweep_mark -- @@ -81,7 +81,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) * handle list lock so that connection-level handle searches * never need to retry. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* Only sweep clean trees where all updates are visible. 
*/ if (btree->modified || @@ -95,7 +95,7 @@ __sweep_expire_one(WT_SESSION_IMPL *session) */ ret = __wt_conn_btree_sync_and_close(session, false, true); -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); return (ret); } @@ -188,7 +188,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) WT_DECL_RET; /* Try to get exclusive access. */ - WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + WT_RET(__wt_try_writelock(session, &dhandle->rwlock)); /* * If there are no longer any references to the handle in any @@ -205,7 +205,7 @@ __sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) * don't retry the discard until it times out again. */ if (ret != 0) { -err: __wt_writeunlock(session, dhandle->rwlock); +err: __wt_writeunlock(session, &dhandle->rwlock); } return (ret); @@ -233,7 +233,7 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __sweep_remove_one(session, dhandle)); if (ret == 0) WT_STAT_CONN_INCR(session, dh_sweep_remove); @@ -246,6 +246,16 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) } /* + * __sweep_server_run_chk -- + * Check to decide if the checkpoint server should continue running. + */ +static bool +__sweep_server_run_chk(WT_SESSION_IMPL *session) +{ + return (F_ISSET(S2C(session), WT_CONN_SERVER_SWEEP)); +} + +/* * __sweep_server -- * The handle sweep server thread. */ @@ -266,11 +276,15 @@ __sweep_server(void *arg) /* * Sweep for dead and excess handles. */ - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(conn, WT_CONN_SERVER_SWEEP)) { + for (;;) { /* Wait until the next event. */ - __wt_cond_wait(session, - conn->sweep_cond, conn->sweep_interval * WT_MILLION); + __wt_cond_wait(session, conn->sweep_cond, + conn->sweep_interval * WT_MILLION, __sweep_server_run_chk); + + /* Check if we're quitting or being reconfigured. 
*/ + if (!__sweep_server_run_chk(session)) + break; + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); @@ -390,7 +404,7 @@ __wt_sweep_create(WT_SESSION_IMPL *session) session = conn->sweep_session; WT_RET(__wt_cond_alloc( - session, "handle sweep server", false, &conn->sweep_cond)); + session, "handle sweep server", &conn->sweep_cond)); WT_RET(__wt_thread_create( session, &conn->sweep_tid, __sweep_server, session)); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 456aa2e0f02..61ced8d11e7 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -230,10 +230,10 @@ __backup_start( * We are holding the checkpoint and schema locks so schema operations * will not see the backup file list until it is complete and valid. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = true; conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); /* We're the lock holder, we own cleanup. */ F_SET(cb, WT_CURBACKUP_LOCKER); @@ -297,9 +297,9 @@ err: /* Close the hot backup file. */ if (ret == 0) { WT_ASSERT(session, dest != NULL); WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest, false)); - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = cb->list; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); } return (ret); @@ -319,9 +319,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) conn = S2C(session); /* Release all btree names held by the backup. 
*/ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup_list = NULL; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); if (cb->list != NULL) { for (i = 0; cb->list[i] != NULL; ++i) __wt_free(session, cb->list[i]); @@ -332,9 +332,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) WT_TRET(__wt_backup_file_remove(session)); /* Checkpoint deletion can proceed, as can the next hot backup. */ - __wt_writelock(session, conn->hot_backup_lock); + __wt_writelock(session, &conn->hot_backup_lock); conn->hot_backup = false; - __wt_writeunlock(session, conn->hot_backup_lock); + __wt_writeunlock(session, &conn->hot_backup_lock); return (ret); } @@ -346,13 +346,9 @@ __backup_stop(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) static int __backup_all(WT_SESSION_IMPL *session) { - WT_DECL_RET; - /* Build a list of the file objects that need to be copied. */ - WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); - - return (ret); + return (__wt_meta_apply_all( + session, NULL, __backup_list_uri_append, NULL)); } /* diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 0ec917fbf95..205afb607c3 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -9,29 +9,6 @@ #include "wt_internal.h" /* - * WT_BTREE_CURSOR_SAVE_AND_RESTORE - * Save the cursor's key/value data/size fields, call an underlying btree - * function, and then consistently handle failure and success. 
- */ -#define WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, f, ret) do { \ - WT_ITEM __key_copy = (cursor)->key; \ - uint64_t __recno = (cursor)->recno; \ - WT_ITEM __value_copy = (cursor)->value; \ - if (((ret) = (f)) == 0) { \ - F_CLR(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); \ - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - } else { \ - if (F_ISSET(cursor, WT_CURSTD_KEY_EXT)) { \ - (cursor)->recno = __recno; \ - WT_ITEM_SET((cursor)->key, __key_copy); \ - } \ - if (F_ISSET(cursor, WT_CURSTD_VALUE_EXT)) \ - WT_ITEM_SET((cursor)->value, __value_copy); \ - F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); \ - } \ -} while (0) - -/* * __curfile_compare -- * WT_CURSOR->compare method for the btree cursor type. */ @@ -109,9 +86,12 @@ __curfile_next(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, next, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_next(cbt, false)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_next(cbt, false)); + + /* Next maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -131,9 +111,12 @@ __wt_curfile_next_random(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, next, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_next_random(cbt)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_next_random(cbt)); + + /* Next-random maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -152,9 +135,12 @@ __curfile_prev(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, prev, cbt->btree); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if ((ret = __wt_btcur_prev(cbt, false)) == 0) - F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); + WT_ERR(__wt_btcur_prev(cbt, false)); + + /* Prev maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -175,7 +161,10 @@ __curfile_reset(WT_CURSOR *cursor) ret = __wt_btcur_reset(cbt); - F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* Reset maintains no position, key or value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0); err: API_END_RET(session, ret); } @@ -194,10 +183,15 @@ __curfile_search(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, search, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_search(cbt), ret); + WT_ERR(__wt_btcur_search(cbt)); + + /* Search maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -216,11 +210,15 @@ __curfile_search_near(WT_CURSOR *cursor, int *exact) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_API_CALL(cursor, session, search_near, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE( - cursor, __wt_btcur_search_near(cbt, exact), ret); + WT_ERR(__wt_btcur_search_near(cbt, exact)); + + /* Search-near maintains a position, key and value. */ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: API_END_RET(session, ret); } @@ -238,38 +236,33 @@ __curfile_insert(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree); + if (!F_ISSET(cursor, WT_CURSTD_APPEND)) - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret); + WT_ERR(__wt_btcur_insert(cbt)); /* - * Insert is the one cursor operation that doesn't end with the cursor - * pointing to an on-page item (except for column-store appends, where - * we are returning a key). That is, the application's cursor continues - * to reference the application's memory after a successful cursor call, - * which isn't true anywhere else. We don't want to have to explain that - * scoping corner case, so we reset the application's cursor so it can - * free the referenced memory and continue on without risking subsequent - * core dumps. + * Insert maintains no position, key or value (except for column-store + * appends, where we are returning a key). 
*/ - if (ret == 0) { - if (!F_ISSET(cursor, WT_CURSTD_APPEND)) - F_CLR(cursor, WT_CURSTD_KEY_INT); - F_CLR(cursor, WT_CURSTD_VALUE_INT); - } + WT_ASSERT(session, + (F_ISSET(cursor, WT_CURSTD_APPEND) && + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT) || + (!F_ISSET(cursor, WT_CURSTD_APPEND) && + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0)); err: CURSOR_UPDATE_API_END(session, ret); return (ret); } /* - * __curfile_update -- - * WT_CURSOR->update method for the btree cursor type. + * __wt_curfile_insert_check -- + * WT_CURSOR->insert_check method for the btree cursor type. */ -static int -__curfile_update(WT_CURSOR *cursor) +int +__wt_curfile_insert_check(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -278,21 +271,21 @@ __curfile_update(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NEEDVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_update(cbt), ret); + ret = __wt_btcur_insert_check(cbt); err: CURSOR_UPDATE_API_END(session, ret); return (ret); } /* - * __wt_curfile_update_check -- - * WT_CURSOR->update_check method for the btree cursor type. + * __curfile_update -- + * WT_CURSOR->update method for the btree cursor type. */ -int -__wt_curfile_update_check(WT_CURSOR *cursor) +static int +__curfile_update(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; @@ -301,11 +294,15 @@ __wt_curfile_update_check(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, update, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); - WT_CURSOR_NOVALUE(cursor); + WT_CURSOR_CHECKKEY(cursor); + WT_CURSOR_CHECKVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE( - cursor, __wt_btcur_update_check(cbt), ret); + WT_ERR(__wt_btcur_update(cbt)); + + /* Update maintains a position, key and value. 
*/ + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT && + F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); err: CURSOR_UPDATE_API_END(session, ret); return (ret); @@ -325,24 +322,21 @@ __curfile_remove(WT_CURSOR *cursor) cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree); - WT_CURSOR_NEEDKEY(cursor); + WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_NOVALUE(cursor); - WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret); + WT_ERR(__wt_btcur_remove(cbt)); /* - * After a successful remove, copy the key: the value is not available. + * Remove with a search-key is fire-and-forget, no position and no key. + * Remove starting from a position maintains the position and a key. + * We don't know which it was at this layer, so can only assert the key + * is not set at all, or internal. There's never a value. */ - if (ret == 0) { - if (F_ISSET(cursor, WT_CURSTD_KEY_INT) && - !WT_DATA_IN_ITEM(&(cursor)->key)) { - WT_ERR(__wt_buf_set(session, &cursor->key, - cursor->key.data, cursor->key.size)); - F_CLR(cursor, WT_CURSTD_KEY_INT); - F_SET(cursor, WT_CURSTD_KEY_EXT); - } - F_CLR(cursor, WT_CURSTD_VALUE_SET); - } + WT_ASSERT(session, + F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 || + F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0); err: CURSOR_UPDATE_API_END(session, ret); return (ret); diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 0ab992bc88c..6fc01c0421f 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -240,7 +240,17 @@ __curindex_search(WT_CURSOR *cursor) found_key = child->key; if (found_key.size < cursor->key.size) WT_ERR(WT_NOTFOUND); - found_key.size = cursor->key.size; + + /* + * Custom collators expect to see complete keys, pass an item containing + * all the visible fields so it unpacks correctly. 
+ */ + if (cindex->index->collator != NULL && + !F_ISSET(cursor, WT_CURSTD_RAW_SEARCH)) + WT_ERR(__wt_struct_repack(session, child->key_format, + cindex->iface.key_format, &child->key, &found_key)); + else + found_key.size = cursor->key.size; WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, &cmp)); @@ -307,8 +317,18 @@ __curindex_search_near(WT_CURSOR *cursor, int *exact) * so we flip the sign of the result to match what callers expect. */ found_key = child->key; - if (found_key.size > cursor->key.size) - found_key.size = cursor->key.size; + if (found_key.size > cursor->key.size) { + /* + * Custom collators expect to see complete keys, pass an item + * containing all the visible fields so it unpacks correctly. + */ + if (cindex->index->collator != NULL) + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, cindex->iface.key_format, + &child->key, &found_key)); + else + found_key.size = cursor->key.size; + } WT_ERR(__wt_compare( session, cindex->index->collator, &cursor->key, &found_key, exact)); @@ -520,8 +540,8 @@ __wt_curindex_open(WT_SESSION_IMPL *session, WT_ERR(__curindex_open_colgroups(session, cindex, cfg)); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - __wt_json_column_init( - cursor, table->key_format, &idx->colconf, &table->colconf); + __wt_json_column_init(cursor, uri, table->key_format, + &idx->colconf, &table->colconf); if (0) { err: WT_TRET(__curindex_close(cursor)); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 013a64ef2d5..80afaf798dc 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -185,7 +185,7 @@ __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos) size = strlen(to_dup->internal_uri) + 3; WT_ERR(__wt_calloc(session, size, 1, &uri)); - snprintf(uri, size, "%s()", to_dup->internal_uri); + WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri)); if ((c = iter->cursor) == NULL || !WT_STREQ(c->uri, uri)) { iter->cursor = NULL; if (c != NULL) @@ -270,7 
+270,7 @@ again: iter->positioned = true; return (ret); } - else if (ret == WT_NOTFOUND) { + if (ret == WT_NOTFOUND) { WT_RET(__curjoin_iter_close_all(iter->child)); entry->subjoin->iter = NULL; iter->child = NULL; @@ -518,8 +518,7 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, } if (disjunction && end == endmax) return (WT_NOTFOUND); - else - return (0); + return (0); } typedef struct { @@ -930,7 +929,7 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, if ((proj = cjoin->projection) != NULL) { size = strlen(urimain) + strlen(proj) + 1; WT_ERR(__wt_calloc(session, size, 1, &mainbuf)); - snprintf(mainbuf, size, "%s%s", urimain, proj); + WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj)); urimain = mainbuf; } WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config, @@ -975,8 +974,8 @@ __curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) WT_ERR_MSG(session, EINVAL, - "join cursors with Bloom filters cannot be " - "used with read-uncommitted isolation"); + "join cursors with Bloom filters cannot be " + "used with read-uncommitted isolation"); if (je->bloom == NULL) { /* * Look for compatible filters to be shared, @@ -1149,8 +1148,8 @@ __curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, newsize = strlen(cjoin->table->name) + idx->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); - snprintf(main_uri, newsize, "%s%.*s", - cjoin->table->name, (int)idx->colconf.len, idx->colconf.str); + WT_ERR(__wt_snprintf(main_uri, newsize, "%s%.*s", + cjoin->table->name, (int)idx->colconf.len, idx->colconf.str)); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); if (idx->extractor == NULL) { @@ -1163,7 +1162,8 @@ __curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, */ len = 
strlen(entry->main->value_format) + 3; WT_ERR(__wt_calloc(session, len, 1, &newformat)); - snprintf(newformat, len, "%s0x", entry->main->value_format); + WT_ERR(__wt_snprintf( + newformat, len, "%s0x", entry->main->value_format)); __wt_free(session, entry->main->value_format); entry->main->value_format = newformat; } @@ -1532,8 +1532,8 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, len = strlen(cindex->iface.key_format) + 3; WT_RET(__wt_calloc(session, len, 1, &entry->repack_format)); - snprintf(entry->repack_format, len, "%s0x", - cindex->iface.key_format); + WT_RET(__wt_snprintf(entry->repack_format, + len, "%s0x", cindex->iface.key_format)); } } return (0); diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index a0a3ffdd974..e8ddb767863 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -8,8 +8,8 @@ #include "wt_internal.h" -static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t, - WT_CONFIG_ITEM *); +static int __json_unpack_put( + WT_SESSION_IMPL *, void *, u_char *, size_t, WT_CONFIG_ITEM *, size_t *); static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t, const char *, WT_CONFIG_ITEM *, bool, size_t *); static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t, @@ -23,20 +23,20 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, bool, const char *, size_t *); #define WT_PACK_JSON_GET(session, pv, jstr) do { \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ - pv.type = pv.type == 's' ? 'j' : 'J'; \ + WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\ + (pv).type = (pv).type == 's' ? 
'j' : 'J'; \ break; \ case 'b': \ case 'h': \ case 'i': \ case 'l': \ case 'q': \ - WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \ + WT_RET(json_int_arg(session, &(jstr), &(pv).u.i)); \ break; \ case 'B': \ case 'H': \ @@ -46,11 +46,11 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, case 'r': \ case 'R': \ case 't': \ - WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \ + WT_RET(json_uint_arg(session, &(jstr), &(pv).u.u)); \ break; \ case 'u': \ - WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ - pv.type = 'K'; \ + WT_RET(json_string_arg(session, &(jstr), &(pv).u.item));\ + (pv).type = 'K'; \ break; \ /* User format strings have already been validated. */ \ WT_ILLEGAL_VALUE(session); \ @@ -61,22 +61,22 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, * __json_unpack_put -- * Calculate the size of a packed byte string as formatted for JSON. */ -static size_t +static int __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, - u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name) + u_char *buf, size_t bufsz, WT_CONFIG_ITEM *name, size_t *retsizep) { WT_PACK_VALUE *pv; const u_char *p, *end; size_t s, n; pv = (WT_PACK_VALUE *)voidpv; - s = (size_t)snprintf((char *)buf, bufsz, "\"%.*s\" : ", - (int)name->len, name->str); + + WT_RET(__wt_snprintf_len_set( + (char *)buf, bufsz, &s, "\"%.*s\" : ", (int)name->len, name->str)); if (s <= bufsz) { bufsz -= s; buf += s; - } - else + } else bufsz = 0; switch (pv->type) { @@ -118,7 +118,8 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, } if (bufsz > 0) *buf++ = '"'; - return (s); + *retsizep += s; + return (0); case 'U': case 'u': s += 2; @@ -140,14 +141,17 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, } if (bufsz > 0) *buf++ = '"'; - return (s); + *retsizep += s; + return (0); case 'b': case 'h': case 'i': case 'l': case 'q': - return (s + - (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.i)); + 
WT_RET(__wt_snprintf_len_incr( + (char *)buf, bufsz, &s, "%" PRId64, pv->u.i)); + *retsizep += s; + return (0); case 'B': case 't': case 'H': @@ -156,11 +160,14 @@ __json_unpack_put(WT_SESSION_IMPL *session, void *voidpv, case 'Q': case 'r': case 'R': - return (s + - (size_t)snprintf((char *)buf, bufsz, "%" PRId64, pv->u.u)); + WT_RET(__wt_snprintf_len_incr( + (char *)buf, bufsz, &s, "%" PRId64, pv->u.u)); + *retsizep += s; + return (0); } - __wt_err(session, EINVAL, "unknown pack-value type: %c", (int)pv->type); - return ((size_t)-1); + + WT_RET_MSG(session, EINVAL, + "unknown pack-value type: %c", (int)pv->type); } /* @@ -194,7 +201,8 @@ __json_struct_size(WT_SESSION_IMPL *session, const void *buffer, needcr = true; WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); WT_RET(__pack_name_next(&packname, &name)); - result += __json_unpack_put(session, &pv, NULL, 0, &name); + WT_RET( + __json_unpack_put(session, &pv, NULL, 0, &name, &result)); } if (ret == WT_NOTFOUND) ret = 0; @@ -243,8 +251,9 @@ __json_struct_unpackv(WT_SESSION_IMPL *session, needcr = true; WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); WT_RET(__pack_name_next(&packname, &name)); - jsize = __json_unpack_put(session, - (u_char *)&pv, jbuf, jbufsize, &name); + jsize = 0; + WT_RET(__json_unpack_put(session, + (u_char *)&pv, jbuf, jbufsize, &name, &jsize)); WT_ASSERT(session, jsize <= jbufsize); jbuf += jsize; jbufsize -= jsize; @@ -304,7 +313,6 @@ __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) __wt_free(session, json->value_buf); __wt_free(session, json); } - return; } /* @@ -323,33 +331,32 @@ __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) if (bufsz >= 1) *buf = ch; return (1); - } else { - abbrev = '\0'; - switch (ch) { - case '\\': - case '"': - abbrev = ch; - break; - case '\f': - abbrev = 'f'; - break; - case '\n': - abbrev = 'n'; - break; - case '\r': - abbrev = 'r'; - break; - case '\t': - abbrev = 't'; - break; - } - if (abbrev 
!= '\0') { - if (bufsz >= 2) { - *buf++ = '\\'; - *buf = abbrev; - } - return (2); + } + abbrev = '\0'; + switch (ch) { + case '\\': + case '"': + abbrev = ch; + break; + case '\f': + abbrev = 'f'; + break; + case '\n': + abbrev = 'n'; + break; + case '\r': + abbrev = 'r'; + break; + case '\t': + abbrev = 't'; + break; + } + if (abbrev != '\0') { + if (bufsz >= 2) { + *buf++ = '\\'; + *buf = abbrev; } + return (2); } } if (bufsz >= 6) { @@ -369,11 +376,11 @@ __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) * of column names. */ void -__wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, +__wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) { WT_CURSOR_JSON *json; - const char *p, *end, *beginkey; + const char *beginkey, *end, *lparen, *p; uint32_t keycnt, nkeys; json = (WT_CURSOR_JSON *)cursor->json_private; @@ -400,8 +407,16 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, keycnt++; p++; } - json->value_names.str = p; - json->value_names.len = WT_PTRDIFF(end, p); + if ((lparen = strchr(uri, '(')) != NULL) { + /* This cursor is a projection. 
*/ + json->value_names.str = lparen; + json->value_names.len = strlen(lparen) - 1; + WT_ASSERT((WT_SESSION_IMPL *)cursor->session, + json->value_names.str[json->value_names.len] == ')'); + } else { + json->value_names.str = p; + json->value_names.len = WT_PTRDIFF(end, p); + } if (idxconf == NULL) { if (p > beginkey) p--; @@ -413,16 +428,16 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, #define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \ size_t _kwlen = strlen(keyword); \ if (strncmp(in, keyword, _kwlen) == 0 && \ - !__wt_isalnum((u_char)in[_kwlen])) { \ - in += _kwlen; \ - result = matchval; \ + !__wt_isalnum((u_char)(in)[_kwlen])) { \ + (in) += _kwlen; \ + (result) = matchval; \ } else { \ - const char *_bad = in; \ - while (__wt_isalnum((u_char)*in)) \ - in++; \ + const char *_bad = (in); \ + while (__wt_isalnum((u_char)*(in))) \ + (in)++; \ WT_RET_MSG(session, EINVAL, \ "unknown keyword \"%.*s\" in JSON", \ - (int)(in - _bad), _bad); \ + (int)((in) - _bad), _bad); \ } \ } while (0) @@ -684,12 +699,13 @@ json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up) #define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \ int __tok; \ - WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\ - if (__tok != tokval) \ + WT_RET(__wt_json_token( \ + (WT_SESSION *)(session), jstr, &__tok, &(start), &(sz))); \ + if (__tok != (tokval)) \ WT_RET_MSG(session, EINVAL, \ "expected JSON %s, got %s", \ __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \ - jstr = start + sz; \ + (jstr) = (start) + (sz); \ } while (0) #define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \ diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index 3ee6554b3c0..e5b56aa406f 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -305,7 +305,7 @@ __curlog_close(WT_CURSOR *cursor) WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); if (F_ISSET(cl, WT_CURLOG_ARCHIVE_LOCK)) - 
__wt_readunlock(session, conn->log->log_archive_lock); + __wt_readunlock(session, &conn->log->log_archive_lock); __wt_free(session, cl->cur_lsn); __wt_free(session, cl->next_lsn); @@ -383,7 +383,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_log_force_write(session, 1, NULL)); /* Log cursors block archiving. */ - __wt_readlock(session, log->log_archive_lock); + __wt_readlock(session, &log->log_archive_lock); F_SET(cl, WT_CURLOG_ARCHIVE_LOCK); if (0) { diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index 10e2fdf28be..fbfc73956e2 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -16,7 +16,7 @@ WT_CURSOR_NEEDKEY(cursor); \ WT_ERR(__wt_buf_set(session, \ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->key, \ - cursor->key.data, cursor->key.size)); \ + (cursor)->key.data, (cursor)->key.size)); \ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ WT_CURSTD_KEY_EXT); \ } while (0) @@ -25,7 +25,7 @@ WT_CURSOR_NEEDVALUE(cursor); \ WT_ERR(__wt_buf_set(session, \ &((WT_CURSOR_METADATA *)(cursor))->file_cursor->value, \ - cursor->value.data, cursor->value.size)); \ + (cursor)->value.data, (cursor)->value.size)); \ F_SET(((WT_CURSOR_METADATA *)(cursor))->file_cursor, \ WT_CURSTD_VALUE_EXT); \ } while (0) diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 5fde64c74ca..0bff642370d 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -163,7 +163,6 @@ static void __curstat_set_value(WT_CURSOR *cursor, ...) 
{ WT_UNUSED(cursor); - return; } /* @@ -478,8 +477,8 @@ __curstat_join_desc(WT_CURSOR_STAT *cst, int slot, const char **resultp) len = strlen("join: ") + strlen(sgrp->desc_prefix) + strlen(static_desc) + 1; WT_RET(__wt_realloc(session, NULL, len, &cst->desc_buf)); - snprintf(cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix, - static_desc); + WT_RET(__wt_snprintf( + cst->desc_buf, len, "join: %s%s", sgrp->desc_prefix, static_desc)); *resultp = cst->desc_buf; return (0); } diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 6264de89df9..99a9e373354 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -144,6 +144,7 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor) */ int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) + WT_GCC_FUNC_ATTRIBUTE((cold)) { WT_SESSION_IMPL *session; @@ -632,6 +633,7 @@ __wt_cursor_reconfigure(WT_CURSOR *cursor, const char *config) int __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) { + WT_DECL_RET; WT_ITEM key; /* @@ -661,9 +663,11 @@ __wt_cursor_dup_position(WT_CURSOR *to_dup, WT_CURSOR *cursor) * cursors cannot reference application memory after cursor operations * and that requirement will save the day. 
*/ - WT_RET(cursor->search(cursor)); + F_SET(cursor, WT_CURSTD_RAW_SEARCH); + ret = cursor->search(cursor); + F_CLR(cursor, WT_CURSTD_RAW_SEARCH); - return (0); + return (ret); } /* diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index fae7667e44f..3b72bb0730f 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -14,8 +14,8 @@ static int __curtable_update(WT_CURSOR *cursor); #define APPLY_CG(ctable, f) do { \ WT_CURSOR **__cp; \ u_int __i; \ - for (__i = 0, __cp = ctable->cg_cursors; \ - __i < WT_COLGROUPS(ctable->table); \ + for (__i = 0, __cp = (ctable)->cg_cursors; \ + __i < WT_COLGROUPS((ctable)->table); \ __i++, __cp++) \ WT_TRET((*__cp)->f(*__cp)); \ } while (0) @@ -511,9 +511,16 @@ __curtable_insert(WT_CURSOR *cursor) */ F_SET(primary, flag_orig | WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); - if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + if (ret == WT_DUPLICATE_KEY && F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curtable_update(cursor)); - else { + + /* + * The cursor is no longer positioned. This isn't just cosmetic, + * without a reset, iteration on this cursor won't start at the + * beginning/end of the table. + */ + APPLY_CG(ctable, reset); + } else { WT_ERR(ret); for (i = 1; i < WT_COLGROUPS(ctable->table); i++, cp++) { @@ -601,22 +608,53 @@ err: CURSOR_UPDATE_API_END(session, ret); static int __curtable_remove(WT_CURSOR *cursor) { + WT_CURSOR *primary; WT_CURSOR_TABLE *ctable; WT_DECL_RET; WT_SESSION_IMPL *session; + bool positioned; ctable = (WT_CURSOR_TABLE *)cursor; JOINABLE_CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_ERR(__curtable_open_indices(ctable)); + /* Check if the cursor was positioned. 
*/ + primary = *ctable->cg_cursors; + positioned = F_ISSET(primary, WT_CURSTD_KEY_INT); + /* Find the old record so it can be removed from indices */ if (ctable->table->nindices > 0) { APPLY_CG(ctable, search); + if (ret == WT_NOTFOUND) + goto notfound; WT_ERR(ret); WT_ERR(__apply_idx(ctable, offsetof(WT_CURSOR, remove), false)); } APPLY_CG(ctable, remove); + if (ret == WT_NOTFOUND) + goto notfound; + WT_ERR(ret); + +notfound: + /* + * If the cursor is configured to overwrite and the record is not found, + * that is exactly what we want. + */ + if (ret == WT_NOTFOUND && F_ISSET(primary, WT_CURSTD_OVERWRITE)) + ret = 0; + + /* + * If the cursor was positioned, it stays positioned with a key but no + * no value, otherwise, there's no position, key or value. This isn't + * just cosmetic, without a reset, iteration on this cursor won't start + * at the beginning/end of the table. + */ + F_CLR(primary, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (positioned) + F_SET(primary, WT_CURSTD_KEY_INT); + else + APPLY_CG(ctable, reset); err: CURSOR_UPDATE_API_END(session, ret); return (ret); @@ -769,7 +807,7 @@ __curtable_complete(WT_SESSION_IMPL *session, WT_TABLE *table) return (0); /* If the table is incomplete, wait on the table lock and recheck. 
*/ - WT_WITH_TABLE_LOCK(session, complete = table->cg_complete); + WT_WITH_TABLE_READ_LOCK(session, complete = table->cg_complete); if (!complete) WT_RET_MSG(session, EINVAL, "'%s' not available until all column groups are created", @@ -951,7 +989,7 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) __wt_json_column_init( - cursor, table->key_format, NULL, &table->colconf); + cursor, uri, table->key_format, NULL, &table->colconf); /* * Open the colgroup cursors immediately: we're going to need them for @@ -989,11 +1027,15 @@ __wt_curtable_open(WT_SESSION_IMPL *session, if (0) { err: if (*cursorp != NULL) { - if (*cursorp != cursor) - WT_TRET(__wt_cursor_close(*cursorp)); + /* + * When a dump cursor is opened, then *cursorp, not + * cursor, is the dump cursor. Close the dump cursor, + * and the table cursor will be closed as its child. + */ + cursor = *cursorp; *cursorp = NULL; } - WT_TRET(__curtable_close(cursor)); + WT_TRET(cursor->close(cursor)); } __wt_scr_free(session, &tmp); diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile index 69e9716b425..3d8c46962f1 100644 --- a/src/docs/Doxyfile +++ b/src/docs/Doxyfile @@ -216,11 +216,19 @@ ALIASES = "notyet{1}=Note: <b>"\1"</b> not yet supported in Wired "hrow{3}=<tr><th>\1</th><th>\2</th><th>\3</th></tr>" \ "hrow{4}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th></tr>" \ "hrow{5}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th></tr>" \ + "hrow{6}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th></tr>" \ + "hrow{7}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th></tr>" \ + "hrow{8}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th></tr>" \ + "hrow{9}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th><th>\9</th></tr>" \ "row{1}=<tr><td>\1</td></tr>" \ "row{2}=<tr><td>\1</td><td>\2</td></tr>" \ 
"row{3}=<tr><td>\1</td><td>\2</td><td>\3</td></tr>" \ "row{4}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td></tr>" \ "row{5}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td></tr>" \ + "row{6}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td></tr>" \ + "row{7}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td></tr>" \ + "row{8}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td></tr>" \ + "row{9}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td><td>\9</td></tr>" \ "configstart{2}=@param config\n Configuration string, see @ref config_strings. Permitted values:\n <table>@hrow{Name,Effect,Values}" \ "config{3}= @row{<tt>\1</tt>,\2,\3}" \ "configend= </table>" \ diff --git a/src/docs/build-pydoc.sh b/src/docs/build-pydoc.sh index aef88fd4c97..5e6e3635be5 100755 --- a/src/docs/build-pydoc.sh +++ b/src/docs/build-pydoc.sh @@ -3,4 +3,4 @@ TOP=$DOCS/.. . $TOP/config.sh cd python -PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.7/site-packages pydoc -w wiredtiger +PYTHONPATH=../../lang/python/src:$THRIFT_HOME/lib/python2.6/site-packages pydoc -w wiredtiger diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index 5726a1d19a1..df52324f8f8 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -370,6 +370,19 @@ Include only "fast" statistics in the output (equivalent to passing <code>statistics=(fast)</code>) to WT_SESSION::open_cursor. <hr> +@section util_truncate wt truncate +Truncate a table, removing all data. + +The \c truncate command truncates the specified \c uri. It is equivalent to a +call to WT_SESSION::truncate with no start or stop specified. + +@subsection util_truncate_synopsis Synopsis +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] truncate uri</code> + +@subsection util_truncate_options Options +The \c truncate command has no command-specific options. 
+ +<hr> @section util_upgrade wt upgrade Upgrade a table. diff --git a/src/docs/cursor-ops.dox b/src/docs/cursor-ops.dox index b743d81db57..e479ff29191 100644 --- a/src/docs/cursor-ops.dox +++ b/src/docs/cursor-ops.dox @@ -145,9 +145,5 @@ that may not be modified or freed by the application. If a longer scope is required, the application must make a copy of the memory before the cursor is re-used, closed or reset. -The comments in this example code explain when the application can safely -modify memory passed to WT_CURSOR::set_key or WT_CURSOR::set_value: - -@snippet ex_scope.c cursor scope operation @m_endif */ diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox index a0a3212be6d..b6434e3d161 100644 --- a/src/docs/cursor-random.dox +++ b/src/docs/cursor-random.dox @@ -20,9 +20,4 @@ cursor configured using \c next_random_sample_size divides the object into \c next_random_sample_size pieces, and each subsequent retrieval returns a record from the next one of those pieces. -For example, setting \c next_random_sample_percent to \c 10 would cause -the cursor to sequentially return records from each tenth part of the -object. Setting \c next_random_sample_percent to \c 1000 would cause the -cursor to sequentially return records from each .1% of the object. - */ diff --git a/src/docs/file-formats.dox b/src/docs/file-formats.dox index d8990aca7a6..21dc4580bc2 100644 --- a/src/docs/file-formats.dox +++ b/src/docs/file-formats.dox @@ -110,7 +110,7 @@ considered. (See @subpage_single huffman for details.) compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -146,7 +146,7 @@ Huffman encoding can be high, and should be considered. 
compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. @@ -157,7 +157,7 @@ compression: block compression. compressing blocks of the backing object's file. The cost is additional CPU and memory use when reading and writing pages to disk. Note the additional CPU cost of block compression can be high, and should be -considered. (See @x_ref compression_formats for details.) +considered. (See @x_ref compression_considerations for details.) Block compression is disabled by default. diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 81e612e8ee8..aa76bef4614 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -66,14 +66,13 @@ each of which is ordered by one or more columns. 
- @subpage_single wtstats <p> - @subpage_single tune_memory_allocator -- @subpage_single tune_page_sizes +- @subpage_single tune_page_size_and_comp - @subpage_single tune_cache - @subpage_single tune_bulk_load - @subpage_single tune_cursor_persist - @subpage_single tune_read_only - @subpage_single tune_durability - @subpage_single tune_checksum -- @subpage_single tune_compression - @subpage_single tune_file_alloc - @subpage_single tune_system_buffer_cache - @subpage_single tune_transparent_huge_pages diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 2413cbc93fb..bc2e16b1122 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -50,6 +50,7 @@ LDFLAGS LIBS LLVM LOGREC +LRU LRVv LSB LSM @@ -167,6 +168,7 @@ dNLen dNOff dT dataN +database's dataitem dataset datasets diff --git a/src/docs/testing.dox b/src/docs/testing.dox index cf280e8f3ff..7d454d54212 100644 --- a/src/docs/testing.dox +++ b/src/docs/testing.dox @@ -27,7 +27,7 @@ The WiredTiger unit test suite includes tests that cover: The WiredTiger Python test suite is built using the WiredTiger Python API and the Python unittest functionality (the test suite requires at -least Python version 2.7). +least Python version 2.6). The WiredTiger test suite automatically runs as part of every commit into the WiredTiger GitHub source tree. 
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox index 01acc849d50..84487c13174 100644 --- a/src/docs/top/main.dox +++ b/src/docs/top/main.dox @@ -6,12 +6,12 @@ WiredTiger is an high performance, scalable, production quality, NoSQL, @section releases Releases <table> -@row{<b>WiredTiger 2.9.0</b> (current), +@row{<b>WiredTiger 2.9.1</b> (current), + <a href="releases/wiredtiger-2.9.1.tar.bz2"><b>[Release package]</b></a>, + <a href="2.9.1/index.html"><b>[Documentation]</b></a>} +@row{<b>WiredTiger 2.9.0</b> (previous), <a href="releases/wiredtiger-2.9.0.tar.bz2"><b>[Release package]</b></a>, <a href="2.9.0/index.html"><b>[Documentation]</b></a>} -@row{<b>WiredTiger 2.8.0</b> (previous), - <a href="releases/wiredtiger-2.8.0.tar.bz2"><b>[Release package]</b></a>, - <a href="2.8.0/index.html"><b>[Documentation]</b></a>} @row{<b>Development branch</b>, <a href="https://github.com/wiredtiger/wiredtiger"><b>[Source code]</b></a>, <a href="develop/index.html"><b>[Documentation]</b></a>} diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox index bbbd2d52296..3b438eda366 100644 --- a/src/docs/transactions.dox +++ b/src/docs/transactions.dox @@ -141,7 +141,7 @@ as if the transaction started at the time of the WT_SESSION::snapshot call that created the snapshot. Named snapshots keep data pinned in cache as if a real transaction were -running for the time that the named transaction is active. The resources +running for the time that the named snapshot is active. The resources associated with named snapshots should be released by calling WT_SESSION::snapshot with a configuration that includes <code>"drop="</code>. See WT_SESSION::snapshot documentation for details of diff --git a/src/docs/tune-compression.dox b/src/docs/tune-compression.dox deleted file mode 100644 index 8db2151aa76..00000000000 --- a/src/docs/tune-compression.dox +++ /dev/null @@ -1,62 +0,0 @@ -/*! 
@page tune_compression Compression - -WiredTiger includes a number of optional compression techniques. Configuring -compression generally decreases on-disk and in-memory resource requirements -and the amount of I/O, and increases CPU cost when data are read and written. - -Configuring compression may change application throughput. For example, -in applications using solid-state drives (where I/O is less expensive), -turning off compression may increase application performance by reducing -CPU costs; in applications where I/O costs are more expensive, turning on -compression may increase application performance by reducing the overall -number of I/O operations. - -An example of turning on row-store key prefix compression: - -@snippet ex_all.c Configure key prefix compression on - -An example of turning on row-store or column-store dictionary compression: - -@snippet ex_all.c Configure dictionary compression on - -@section compression_formats Block Compression Formats -WiredTiger provides two methods of compressing your data when using block -compression: the raw and noraw methods. These methods change how WiredTiger -works to fit data into the blocks that are stored on disk. - -@subsection noraw_compression Noraw Compression -Noraw compression is the traditional compression model where a fixed -amount of data is given to the compression system, then turned into a -compressed block of data. The amount of data chosen to compress is the -data needed to fill the uncompressed block. Thus when compressed, the block will -be smaller than the normal data size and the sizes written to disk will often -vary depending on how compressible the data being stored is. Algorithms -using noraw compression include zlib-noraw, lz4-noraw and snappy. - -@subsection raw_compression Raw Compression -WiredTiger's raw compression takes advantage of compressors that provide a -streaming compression API. 
Using the streaming API WiredTiger will try to fit -as much data as possible into one block. This means that blocks created -with raw compression should be of similar size. Using a streaming compression -method should also make for less overhead in compression, as the setup and -initial work for compressing is done fewer times compared to the amount of -data stored. Algorithms using raw compression include zlib, lz4. - -@subsection to_raw_or_noraw Choosing between Raw and Noraw Compression -When looking at which compression method to use the biggest consideration is -that raw compression will normally provide higher compression levels while -using more CPU for compression. - -An additional consideration is that raw compression may provide a performance -advantage in workloads where data is accessed sequentially. That is because -more data is generally packed into each block on disk. Conversely, noraw -compression may perform better for workloads with random access patterns -because each block will tend to be smaller and require less work to read and -decompress. - -See @ref file_formats_compression for more information on available -compression techniques. - -See @ref compression for information on how to configure and enable compression. - - */ diff --git a/src/docs/tune-page-size-and-comp.dox b/src/docs/tune-page-size-and-comp.dox new file mode 100644 index 00000000000..96b0fda2333 --- /dev/null +++ b/src/docs/tune-page-size-and-comp.dox @@ -0,0 +1,426 @@ +/*! @page tune_page_size_and_comp Tuning page size and compression + +This document aims to explain the role played by different page sizes in +WiredTiger. It also details motivation behind an application wanting to modify +these page sizes from their default values and the procedure to do so. +Applications commonly configure page sizes based on their workload's typical key +and value size. 
Once a page size has been chosen, appropriate defaults for the +other configuration values are derived by WiredTiger from the page sizes, and +relatively few applications will need to modify the other page and key/value +size configuration options. WiredTiger also offers several compression options +that have an impact on the size of the data both in-memory and on-disk. Hence +while selecting page sizes, an application must also look at its desired +compression needs. Since the data and workload for a table differs from one +table to another in the database, an application can choose to set page sizes +and compression options on a per-table basis. + +@section data_life_cycle Data life cycle +Before detailing each page size, here is a review of how data gets stored inside +WiredTiger: + - WiredTiger uses the physical disks to store data durably, creating on-disk +files for the tables in the database directory. It also caches the portion of +the table being currently accessed by the application for reading or writing in +main memory. + - WiredTiger maintains a table's data in memory using a data structure called a +<a href="https://en.wikipedia.org/wiki/B-tree">B-Tree</a> ( +<a href="https://en.wikipedia.org/wiki/B%2B_tree">B+ Tree</a> to be specific), +referring to the nodes of a B-Tree as pages. Internal pages carry only keys. The +leaf pages store both keys and values. + - The format of the in-memory pages is not the same as the format of the +on-disk pages. Therefore, the in-memory pages regularly go through a process +called reconciliation to create data structures appropriate for storage on the +disk. These data structures are referred to as on-disk pages. An application can +set a maximum size separately for the internal and leaf on-disk pages otherwise +WiredTiger uses a default value. If reconciliation of an in-memory page is +leading to an on-disk page size greater than this maximum, WiredTiger creates +multiple smaller on-disk pages. 
+ - A component of WiredTiger called the Block Manager divides the on-disk pages +into smaller chunks called blocks, which then get written to the disk. The size +of these blocks is defined by a parameter called allocation_size, which is the +underlying unit of allocation for the file the data gets stored in. An +application might choose to have data compressed before it gets stored to disk +by enabling block compression. + - A database's tables are usually much larger than the main memory available. +Not all of the data can be kept in memory at any given time. A process called +eviction takes care of making space for new data by freeing the memory of data +infrequently accessed. An eviction server regularly finds in-memory pages that +have not been accessed in a while (following an LRU algorithm). Several +background eviction threads continuously process these pages, reconcile them to +disk and remove them from the main memory. + - When an application does an insert or an update of a key/value pair, the +associated key is used to refer to an in-memory page. In the case of this page +not being in memory, appropriate on-disk page(s) are read and an in-memory page +constructed (the opposite of reconciliation). A data structure is maintained on +every in-memory page to store any insertions or modifications to the data done +on that page. As more and more data gets written to this page, the page's memory +footprint keeps growing. + - An application can choose to set the maximum size a page is allowed to grow +in-memory. A default size is set by WiredTiger if the application doesn't +specify one. To keep page management efficient, as a page grows larger in-memory +and approaches this maximum size, if possible, it is split into smaller +in-memory pages. + - When doing an insert or an update, if a page grows larger than the maximum, +the application thread is used to forcefully evict this page. 
This is done to +split the growing page into smaller in-memory pages and reconcile them into +on-disk pages. Once written to the disk they are removed from the main memory, +making space for more data to be written. When an application gets involved in +forced eviction, it might take longer than usual to do these inserts and +updates. It is not always possible to (force) evict a page from memory and this +page can temporarily grow larger in size than the configured maximum. This page +then remains marked to be evicted and reattempts are made as the application +puts more data in it. + +@section configurable_page_struct Configurable page structures in WiredTiger +There are three page sizes that the user can configure: + 1. The maximum page size of any type of in-memory page in the WiredTiger cache, +memory_page_max. + 2. The maximum size of the on-disk page for an internal page, internal_page_max. + 3. The maximum size of the on-disk leaf page, leaf_page_max. + +There are additional configuration settings that tune more esoteric and +specialized data. Those are included for completeness but are rarely changed. + +@subsection memory_page_max memory_page_max +The maximum size a table's page is allowed to grow to in memory before being +reconciled to disk. + - An integer, with acceptable values between 512B and 10TB + - Default size: 5 MB + - Additionally constrained by the condition: + leaf_page_max <= memory_page_max <= cache_size/10 + - Motivation to tune the value: +\n memory_page_max is significant for applications wanting to tune for +consistency in write intensive workloads. + - This is the parameter to start with for tuning and trying different values +to find the correct balance between overall throughput and individual operation +latency for each table. + - Splitting a growing in-memory page into smaller pages and reconciliation +both require exclusive access to the page which makes an application's write +operations wait. 
Having a large memory_page_max means that the pages will need +to be split and reconciled less often. But when that happens, the duration that +an exclusive access to the page is required is longer, increasing the latency of +an application's insert or update operations. Conversely, having a smaller +memory_page_max reduces the time taken for splitting and reconciling the pages, +but causes it to happen more frequently, forcing more frequent but shorter +exclusive accesses to the pages. + - Applications should choose the memory_page_max value considering the +trade-off between frequency of exclusive access to the pages (for reconciliation +or splitting pages into smaller pages) versus the duration that the exclusive +access is required. + - Configuration: +\n Specified as memory_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,memory_page_max=10MB" +</pre> + +@subsection internal_page_max internal_page_max +The maximum page size for the reconciled on-disk internal pages of the B-Tree, +in bytes. When an internal page grows past this size, it splits into multiple +pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 4 KB (*appropriate for applications with relatively small keys) + - Additionally constrained by the condition: the size must be a multiple of the +allocation size + - Motivation to tune the value: +\n internal_page_max is significant for applications wanting to avoid excessive +L2 cache misses while searching the tree. + - Recall that only keys are stored on internal pages, so the type and size of +the key values for a table help drive the setting for this parameter. + - Should be sized to fit into on-chip caches. + - Applications doing full-table scans with out-of-memory workloads might +increase internal_page_max to transfer more data per I/O. + - Influences the shape of the B-Tree, i.e. 
depth and the number of children +each page in the B-Tree has. To iterate to the desired key/value pair in the B-Tree, +WiredTiger has to binary search the key-range in a page to determine the child +page to proceed to and continue down the depth until it reaches the correct leaf +page. Having an unusually deep B-Tree, or having too many children per page can +negatively impact time taken to iterate the B-Tree, slowing down the application. +The number of children per page and, hence, the tree depth depends upon the +number of keys that can be stored in an internal page, which is +internal_page_max divided by key size. Applications should choose an appropriate +internal_page_max size that keeps the B-Tree from getting too deep. + - Configuration: +\n Specified as internal_page_max configuration option to WT_SESSION::create(). +An example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB" +</pre> + +@subsection leaf_page_max leaf_page_max +The maximum page size for the reconciled on-disk leaf pages of the B-Tree, in +bytes. When a leaf page grows past this size, it splits into multiple pages. + - An integer, with acceptable values between 512B and 512MB + - Default size: 32 KB (*appropriate for applications with relatively small keys +and values) + - Additionally constrained by the condition: must be a multiple of the +allocation size + - Motivation to tune the value: +\n leaf_page_max is significant for applications wanting to maximize sequential +data transfer from a storage device. + - Should be sized to maximize I/O performance (when reading from disk, it is +usually desirable to read a large amount of data, assuming some locality of +reference in the application's access pattern). + - Applications doing full-table scans through out-of-cache workloads might +increase leaf_page_max to transfer more data per I/O.
+ - Applications focused on read/write amplification might decrease the page +size to better match the underlying storage block size. + - Configuration: +\n Specified as leaf_page_max configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,internal_page_max=16KB,leaf_page_max=1MB" +</pre> + +The following configuration items are rarely used. They are described +for completeness: + +@subsection allocation_size allocation_size +This is the underlying unit of allocation for the file. As the unit of file +allocation, it sets the minimum page size and how much space is wasted when +storing small amounts of data and overflow items. + - an integer between 512B and 128 MB + - must be a power-of-two + - default : 4 KB + - Motivation to tune the value: +\n Most applications should not need to tune the allocation size. + - To be compatible with virtual memory page sizes and direct I/O requirements +on the platform (4KB for most common server platforms) + - Smaller values decrease the file space required by overflow items. + - For example, if the allocation size is set to 4KB, an overflow item of +18,000 bytes requires 5 allocation units and wastes about 2KB of space. If the +allocation size is 16KB, the same overflow item would waste more than 10KB. + - Configuration: +\n Specified as allocation_size configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,allocation_size=4KB" +</pre> + +@subsection key_val_max internal/leaf key/value max + - Overflow items +\n Overflow items are keys and values too large to easily store on a page. Overflow +items are stored separately in the file from the page where the item logically +appears, and so reading or writing an overflow item is more expensive than an +on-page item, normally requiring additional I/O.
Additionally, overflow values +are not cached in memory. This means overflow items won't affect the caching +behavior of the application. It also means that each time an overflow value is +read, it is re-read from disk. + - internal_key_max +\n The largest key stored in an internal page, in bytes. If set, keys larger than +the specified size are stored as overflow items. + - The default and the maximum allowed value are both one-tenth the size of a +newly split internal page. + - leaf_key_max +\n The largest key stored in a leaf page, in bytes. If set, keys larger than the +specified size are stored as overflow items. + - The default value is one-tenth the size of a newly split leaf page. + - leaf_value_max +\n The largest value stored in a leaf page, in bytes. If set, values larger than +the specified size are stored as overflow items. + - The default is one-half the size of a newly split leaf page. + - If the size is larger than the maximum leaf page size, the page size is +temporarily ignored when large values are written. + - Motivation to tune the values: +\n Most applications should not need to tune the maximum key and value sizes. +Applications requiring a small page size, but also having latency concerns such +that the additional work to retrieve an overflow item is an issue, may find modifying these +values useful. +\n Since overflow items are separately stored in the on-disk file, aren't cached +and require additional I/O to access (read or write), applications should avoid +creating overflow items. + - Since page sizes also determine the default size of overflow items, i.e., +keys and values too large to easily store on a page, they can be configured to +avoid performance penalties working with overflow items: + - Applications with large keys and values, and concerned with latency, +might increase the page size to avoid creating overflow items, in order to avoid +the additional cost of retrieving them.
+ - Applications with large keys and values, doing random searches, might +decrease the page size to avoid wasting cache space on overflow items that +aren't likely to be needed. + - Applications with large keys and values, doing table scans, might +increase the page size to avoid creating overflow items, as the overflow items +must be read into memory in all cases, anyway. + - internal_key_max, leaf_key_max and leaf_value_max configuration values +allow applications to change the size at which a key or value will be treated +as an overflow item. + - Most applications should not need to tune the maximum key and value +sizes. + - The value of internal_key_max is relative to the maximum internal page +size. Because the number of keys on an internal page determines the depth of the +tree, the internal_key_max value can only be adjusted within a certain range, +and the configured value will be automatically adjusted by WiredTiger, if +necessary, to ensure a reasonable number of keys fit on an internal page. + - The values of leaf_key_max and leaf_value_max are not relative to the +maximum leaf page size. If either is larger than the maximum page size, the page +size will be ignored when the larger keys and values are being written, and a +larger page will be created as necessary. + - Configuration: +\n Specified as internal_key_max, leaf_key_max and leaf_value_max configuration +options to WT_SESSION::create(). An example of configuration string for a large +leaf overflow value: + +<pre> + "key_format=S,value_format=S,leaf_page_max=16KB,leaf_value_max=256KB" +</pre> + +@subsection split_pct split_pct (split percentage) +The size (specified as percentage of internal/leaf page_max) at which the +reconciled page must be split into multiple smaller pages before being sent for +compression and then be written to the disk. 
If the reconciled page can fit into +a single on-disk page without the page growing beyond its set max size, +split_pct is ignored and the page isn't split. + - an integer between 25 and 100 + - default : 75 + - Motivation to tune the value: +\n Most applications should not need to tune the split percentage size. + - This value should be selected to avoid creating a large number of tiny +pages or repeatedly splitting whenever new entries are inserted. +\n For example, if the maximum page size is 1MB, a split_pct value of 10% +would potentially result in creating a large number of 100KB pages, which may +not be optimal for future I/O. Or, if the maximum page size is 1MB, a split_pct +value of 90% would potentially result in repeatedly splitting pages as the split +pages grow to 1MB over and over. The default value for split_pct is 75%, +intended to keep large pages relatively large, while still giving split pages +room to grow. + - Configuration: +\n Specified as split_pct configuration option to WT_SESSION::create(). An +example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,split_pct=60" +</pre> + +@section compression_considerations Compression considerations +WiredTiger compresses data at several stages to preserve memory and disk space. +Applications can configure these different compression algorithms to tailor +their requirements between memory, disk and CPU consumption. Compression +algorithms other than block compression work by modifying how the keys and +values are represented, and hence reduce data size in-memory and on-disk. Block +compression on the other hand compresses the data in its binary representation +while saving it on the disk. + +Configuring compression may change application throughput.
For example, in +applications using solid-state drives (where I/O is less expensive), turning +off compression may increase application performance by reducing CPU costs; in +applications where I/O costs are more expensive, turning on compression may +increase application performance by reducing the overall number of I/O +operations. + +WiredTiger uses some internal algorithms to compress the amount of data stored +that are not configurable, but always on. For example, run-length reduces the +size requirement by storing sequential, duplicate values in the store only a +single time (with an associated count). + +Different compression options available with WiredTiger: + - Key-prefix + - Reduces the size requirement by storing any identical key prefix only once +per page. The cost is additional CPU and memory when operating on the in-memory +tree. Specifically, reverse sequential cursor movement (but not forward) through +a prefix-compressed page or the random lookup of a key/value pair will allocate +sufficient memory to hold some number of uncompressed keys. So, for example, if +key prefix compression only saves a small number of bytes per key, the +additional memory cost of instantiating the uncompressed key may mean prefix +compression is not worthwhile. Further, in cases where the on-disk cost is the +primary concern, block compression may mean prefix compression is less useful. + - Configuration: +\n Specified as prefix_compression configuration option to +WT_SESSION::create(). Applications may limit the use of prefix compression by +configuring the minimum number of bytes that must be gained before prefix +compression is used with prefix_compression_min configuration option. An example +of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,prefix_compression=true,prefix_compression_min=7" +</pre> + + - Dictionary + - Reduces the size requirement by storing any identical value only once per +page. 
+ - Configuration: +\n Specified as dictionary configuration option to +WT_SESSION::create(), which specifies the maximum number of unique values +remembered in the B-Tree row-store leaf page value dictionary. An example of +such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,dictionary=1000" +</pre> + + - Huffman + - Reduces the size requirement by compressing individual key/value items, and +can be separately configured for either or both keys and values. The additional CPU +cost of Huffman encoding can be high, and should be considered. (See Huffman +Encoding for details.) + - Configuration: +\n Specified as huffman_key and/or huffman_value configuration option to +WT_SESSION::create(). These options can take values of "english" (to use a +built-in English language frequency table), "utf8<file>" or "utf16<file>" (to +use a custom utf8 or utf16 symbol frequency table file). An example of such a +configuration string is as follows: + +<pre> + "key_format=S,value_format=S,huffman_key=english,huffman_value=english" +</pre> + + - Block Compression + - Reduces the size requirement of on-disk objects by compressing blocks of +the backing object's file. The additional CPU cost of block compression can be +high, and should be considered. When block compression has been configured, +configured page sizes will not match the actual size of the page on disk. + - WiredTiger provides two methods of compressing your data when using block +compression: the raw and noraw methods. These methods change how WiredTiger +works to fit data into the blocks that are stored on disk. Applications needing +to write specific sized blocks may want to consider implementing a +WT_COMPRESSOR::compress_raw function. + - Noraw compression: +\n A fixed amount of data is given to the compression system, then turned into +a compressed block of data. The amount of data chosen to compress is the data +needed to fill the uncompressed block.
Thus when compressed, the block will be +smaller than the normal data size and the sizes written to disk will often vary +depending on how compressible the data being stored is. Algorithms using noraw +compression include zlib-noraw, lz4-noraw and snappy. +Noraw compression is better suited for workloads with random access patterns +because each block will tend to be smaller and require less work to read and +decompress. + - Raw compression: +\n WiredTiger's raw compression takes advantage of compressors that provide a +streaming compression API. Using the streaming API WiredTiger will try to fit as +much data as possible into one block. This means that blocks created with raw +compression should be of similar size. Using a streaming compression method +should also make for less overhead in compression, as the setup and initial work +for compressing is done fewer times compared to the amount of data stored. +Algorithms using raw compression include zlib, lz4. +Compared to noraw, raw compression provides more compression while using more +CPU. Raw compression may provide a performance advantage in workloads where data +is accessed sequentially. That is because more data is generally packed into +each block on disk. + - Configuration: +\n Specified as the block_compressor configuration option to +WT_SESSION::create(). If WiredTiger has builtin support for "lz4", "snappy", +"zlib" or "zstd" compression, these names are available as the value to the +option. An example of such a configuration string is as follows: + +<pre> + "key_format=S,value_format=S,block_compressor=snappy" +</pre> + +See @ref compression for further information on how to configure and enable +different compression options. 
+ +@subsection table_compress Table summarizing compression in WiredTiger + +<table> +@hrow{Compression Type, Supported by row-store, Supported by variable col-store, + Supported by fixed col-store, Default config, Reduces in-mem size, + Reduces on-disk size, CPU and Memory cost} +@row{Key-prefix, yes, no, no, disabled, yes, yes, minor} +@row{Dictionary, yes, yes, no, disabled, yes, yes, minor} +@row{Huffman, yes, yes, no, disabled, yes, yes, can be high} +@row{Block, yes, yes, yes, disabled, no, yes, can be high} +</table> + +*/ diff --git a/src/docs/tune-page-sizes.dox b/src/docs/tune-page-sizes.dox deleted file mode 100644 index 130e047a02d..00000000000 --- a/src/docs/tune-page-sizes.dox +++ /dev/null @@ -1,142 +0,0 @@ -/*! @page tune_page_sizes Page and overflow key/value sizes - -There are seven page and key/value size configuration strings: - -- allocation size (\c allocation_size), -- page sizes (\c internal_page_max and \c leaf_page_max), -- key and value sizes (\c internal_key_max, \c leaf_key_max and \c leaf_value_max), and the -- page-split percentage (\c split_pct). - -All seven are specified to the WT_SESSION::create method, in other -words, they are configurable on a per-file basis. - -Applications commonly configure page sizes, based on their workload's -typical key and value size. Once the correct page size has been chosen, -appropriate defaults for the other configuration values are derived from -the page sizes, and relatively few applications will need to modify the -other page and key/value size configuration options. - -An example of configuring page and key/value sizes: - -@snippet ex_all.c Create a table and configure the page size - -@section tune_page_sizes_sizes Page, key and value sizes - -The \c internal_page_max and \c leaf_page_max configuration values -specify a maximum size for Btree internal and leaf pages. That is, when -an internal or leaf page grows past that size, it splits into multiple -pages. 
Generally, internal pages should be sized to fit into on-chip -caches in order to minimize cache misses when searching the tree, while -leaf pages should be sized to maximize I/O performance (if reading from -disk is necessary, it is usually desirable to read a large amount of -data, assuming some locality of reference in the application's access -pattern). - -The default page size configurations (2KB for \c internal_page_max, 32KB -for \c leaf_page_max), are appropriate for applications with relatively -small keys and values. - -- Applications doing full-table scans through out-of-memory workloads -might increase both internal and leaf page sizes to transfer more data -per I/O. -- Applications focused on read/write amplification might decrease the page -size to better match the underlying storage block size. - -When block compression has been configured, configured page sizes will -not match the actual size of the page on disk. Block compression in -WiredTiger happens within the I/O subsystem, and so a page might split -even if subsequent compression would result in a resulting page size -small enough to leave as a single page. In other words, page sizes are -based on in-memory sizes, not on-disk sizes. Applications needing to -write specific sized blocks may want to consider implementing a -WT_COMPRESSOR::compress_raw function. - -The page sizes also determine the default size of overflow items, that -is, keys and values too large to easily store on a page. Overflow items -are stored separately in the file from the page where the item logically -appears, and so reading or writing an overflow item is more expensive -than an on-page item, normally requiring additional I/O. Additionally, -overflow values are not cached in memory. This means overflow items -won't affect the caching behavior of the application, but it also means -that each time an overflow value is read, it is re-read from disk. 
- -For both of these reasons, applications should avoid creating large -numbers of commonly referenced overflow items. This is especially -important for keys, as keys on internal pages are referenced during -random searches, not just during data retrieval. Generally, -applications should make every attempt to avoid creating overflow keys. - -- Applications with large keys and values, and concerned with latency, -might increase the page size to avoid creating overflow items, in order -to avoid the additional cost of retrieving them. - -- Applications with large keys and values, doing random searches, might -decrease the page size to avoid wasting cache space on overflow items -that aren't likely to be needed. - -- Applications with large keys and values, doing table scans, might -increase the page size to avoid creating overflow items, as the overflow -items must be read into memory in all cases, anyway. - -The \c internal_key_max, \c leaf_key_max and \c leaf_value_max -configuration values allow applications to change the size at which a -key or value will be treated as an overflow item. - -The value of \c internal_key_max is relative to the maximum internal -page size. Because the number of keys on an internal page determines -the depth of the tree, the \c internal_key_max value can only be -adjusted within a certain range, and the configured value will be -automatically adjusted by WiredTiger, if necessary to ensure a -reasonable number of keys fit on an internal page. - -The values of \c leaf_key_max and \c leaf_value_max are not relative to -the maximum leaf page size. If either is larger than the maximum page -size, the page size will be ignored when the larger keys and values are -being written, and a larger page will be created as necessary. - -Most applications should not need to tune the maximum key and value -sizes. 
Applications requiring a small page size, but also having -latency concerns such that the additional work to retrieve an overflow -item is an issue, may find them useful. - -An example of configuring a large leaf overflow value: - -@snippet ex_all.c Create a table and configure a large leaf value max - -@section tune_page_sizes_split_percentage Split percentage - -The \c split_pct configuration string configures the size of a split -page. When a page grows sufficiently large that it must be written as -multiple disk blocks, the newly written block size is \c split_pct -percent of the maximum page size. This value should be selected to -avoid creating a large number of tiny pages or repeatedly splitting -whenever new entries are inserted. For example, if the maximum page -size is 1MB, a \c split_pct value of 10% would potentially result in -creating a large number of 100KB pages, which may not be optimal for -future I/O. Or, if the maximum page size is 1MB, a \c split_pct value -of 90% would potentially result in repeatedly splitting pages as the -split pages grow to 1MB over and over. The default value for \c -split_pct is 75%, intended to keep large pages relatively large, while -still giving split pages room to grow. - -Most applications should not need to tune the split percentage size. - -@section tune_page_sizes_allocation_size Allocation size - -The \c allocation_size configuration value is the underlying unit of -allocation for the file. As the unit of file allocation, it sets the -minimum page size and how much space is wasted when storing small -amounts of data and overflow items. For example, if the allocation size -is set to 4KB, an overflow item of 18,000 bytes requires 5 allocation -units and wastes about 2KB of space. If the allocation size is 16KB, -the same overflow item would waste more than 10KB. - -The default allocation size is 4KB, chosen for compatibility with -virtual memory page sizes and direct I/O requirements on common server -platforms. 
- -Most applications should not need to tune the allocation size; it is -primarily intended for applications coping with the specific -requirements some file systems make to support features like direct I/O. - -*/ diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index 1e0e2eaf99a..e5fce3d0d5d 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -1,22 +1,66 @@ /*! @page upgrading Upgrading WiredTiger applications -@section version_291 Upgrading to Version 2.9.1 + +@section version_292 Upgrading to Version 2.9.2 <dl> -<dt>WiredTiger now requires Python 2.7 at minimum</dt> + +<dt>WiredTiger utility now supports truncate</dt> +<dd> +The WiredTiger utility \c wt can now \c truncate objects, removing all +contents from the specified object. +</dd> + +<dt>Handle list lock statistics</dt> <dd> -The minimum version of Python supported by WiredTiger is now 2.7 up from the -previous version of 2.6. This is due to extra unit tests added in this release -that depend on 2.7. This is not due to a change in the Python API. +In the 2.9.1 release we added statistics tracking handle list lock timing, we +have switched that lock from a spin lock to a read-write lock, and consequently +changed the statistics tracking lock related wait time. </dd> +<dt>Forced and named checkpoint error conditions changed</dt> +<dd> +There are new cases where checkpoints created with an explicit name or the +"force" configuration option can return an EBUSY error. This can happen if +the checkpoint overlaps with other schema operations, for example table create. +</dd> + +<dt>WT_CURSOR::remove may not return a positioned cursor</dt> +<dd> +The WT_CURSOR::remove method was previously documented to always return a +positioned cursor on success, which is not possible when \c overwrite=true +and the record does not exist. + +The documentation has been updated, and the method has been changed to +never return a cursor position unless called with an existing cursor +position. 
In other words, if the cursor is positioned and the +WT_CURSOR::remove is called, the cursor will remain positioned; if the +cursor is not positioned and the WT_CURSOR::remove method is called, the +cursor will not be positioned on return. +</dd> + +</dl><hr> +@section version_291 Upgrading to Version 2.9.1 +<dl> + <dt>Changes to hazard pointer configuration</dt> <dd> The \c hazard_max parameter to ::wiredtiger_open is now ignored. Memory is allocated for hazard pointers as required by each session. </dd> -</dl><hr> +<dt>Change to the default fadvise behavior for data files</dt> +<dd> +The old default behavior was to advise the file system that access would be +random for data files, and there was no way to alter that. We no longer +advise the file system of expected access patterns by default, and +have added a new \c access_pattern_hint configuration option available for +WT_SESSION::create that can be used to restore the old default by setting +the value to "random". +</dd> + +</dl><hr> @section version_290 Upgrading to Version 2.9.0 <dl> + <dt>Changes to cursor behavior after WT_CURSOR::insert</dt> <dd> After a successful call to WT_CURSOR::insert, unless a cursor has record @@ -314,7 +358,7 @@ be updated. The WT_SESSION::create \c internal_item_max and \c leaf_item_max configuration strings are now deprecated in favor of the \c internal_key_max, \c leaf_key_max, and \c leaf_value_max -configuration strings. See @ref tune_page_sizes for more information. +configuration strings. See @ref tune_page_size_and_comp for more information. </dd> </dl><hr> diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 83aadf8a776..6bdcf5f4f8d 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -167,6 +167,8 @@ do population phase; false to use existing database number of WiredTiger databases to use.
Each database will execute the workload using a separate home directory and complete set of worker threads @par drop_tables (boolean, default=false) Whether to drop all tables at the end of the run, and report time taken to do the drop. +@par in_memory (boolean, default=false) +Whether to create the database in-memory. @par icount (unsigned int, default=5000) number of records to initially populate. If multiple tables are configured the count is spread evenly across all tables. @par idle_table_cycle (unsigned int, default=0) @@ -195,14 +197,14 @@ use pareto distribution for random numbers. Zero to disable, otherwise a percen number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load +@par pre_load_data (boolean, default=false) +Scan all data prior to starting the workload phase to warm the cache @par random_range (unsigned int, default=0) if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) partition data by range (vs hash) -@par read_range (unsigned int, default=0) -scan a range of keys after each search @par readonly (boolean, default=false) reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) @@ -228,7 +230,7 @@ number of tables to run operations over. Keys are divided evenly over the table @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. 
@par threads (string, default="") -workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'read_range', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. 
There are also behavior modifiers, supported modifiers are 'ops_per_txn' @par transaction_config (string, default="") WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero @par table_name (string, default="test") diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 17b038fb003..3d8f4a61ca7 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -15,15 +15,27 @@ int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; + btree = S2BT(session); + /* - * We need exclusive access to the file -- disable ordinary eviction - * and drain any blocks already queued. + * We need exclusive access to the file, we're about to discard the root + * page. Assert eviction has been locked out. */ - WT_RET(__wt_evict_file_exclusive_on(session)); + WT_ASSERT(session, + btree->evict_disabled > 0 || + !F_ISSET(session->dhandle, WT_DHANDLE_OPEN)); + + /* + * We do discard objects without pages in memory. If that's the case, + * we're done. + */ + if (btree->root.page == NULL) + return (0); /* Make sure the oldest transaction ID is up-to-date. */ WT_RET(__wt_txn_update_oldest( @@ -102,7 +114,5 @@ err: /* On error, clear any left-over tree walk. 
*/ session, next_ref, WT_READ_NO_EVICT)); } - __wt_evict_file_exclusive_off(session); - return (ret); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 6fa728916de..26bbf9f679b 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -15,6 +15,7 @@ static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, bool); static int __evict_pass(WT_SESSION_IMPL *); static int __evict_server(WT_SESSION_IMPL *, bool *); +static int __evict_tune_workers(WT_SESSION_IMPL *session); static int __evict_walk(WT_SESSION_IMPL *, WT_EVICT_QUEUE *); static int __evict_walk_file( WT_SESSION_IMPL *, WT_EVICT_QUEUE *, u_int, u_int *); @@ -23,6 +24,60 @@ static int __evict_walk_file( (S2C(s)->evict_threads.current_threads > 1) /* + * __evict_lock_handle_list -- + * Try to get the handle list lock, with yield and sleep back off. + * Keep timing statistics overall. + */ +static int +__evict_lock_handle_list(WT_SESSION_IMPL *session) +{ + struct timespec enter, leave; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_RWLOCK *dh_lock; + u_int spins; + bool dh_stats; + + conn = S2C(session); + cache = conn->cache; + dh_lock = &conn->dhandle_lock; + + /* + * Setup tracking of handle lock acquisition wait time if statistics + * are enabled. + */ + dh_stats = WT_STAT_ENABLED(session); + + if (dh_stats) + __wt_epoch(session, &enter); + + /* + * Use a custom lock acquisition back off loop so the eviction server + * notices any interrupt quickly. + */ + for (spins = 0; + (ret = __wt_try_readlock(session, dh_lock)) == EBUSY && + cache->pass_intr == 0; spins++) { + if (spins < WT_THOUSAND) + __wt_yield(); + else + __wt_sleep(0, WT_THOUSAND); + } + /* + * Only record statistics on success. 
+ */ + WT_RET(ret); + if (dh_stats) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV( + session, lock_handle_list_wait_eviction, + (int64_t)WT_TIMEDIFF_US(leave, enter)); + } + return (0); +} + +/* * __evict_entry_priority -- * Get the adjusted read generation for an eviction entry. */ @@ -143,8 +198,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) } __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock); } - WT_ASSERT(session, - !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); + WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU)); __wt_spin_unlock(session, &cache->evict_queue_lock); } @@ -213,7 +267,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) } #endif - __wt_cond_auto_signal(session, cache->evict_cond); + __wt_cond_signal(session, cache->evict_cond); } /* @@ -226,12 +280,12 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; - bool did_work; + bool did_work, was_intr; conn = S2C(session); cache = conn->cache; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* * Ensure the cache stuck timer is initialized when starting eviction. */ @@ -254,12 +308,28 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) ret = __evict_server(session, &did_work); F_CLR(cache->walk_session, WT_SESSION_LOCKED_PASS); F_CLR(session, WT_SESSION_LOCKED_PASS); + was_intr = cache->pass_intr != 0; __wt_spin_unlock(session, &cache->evict_pass_lock); WT_ERR(ret); + + /* + * If the eviction server was interrupted, wait until + * requests have been processed: the system may + * otherwise be busy so don't go to sleep. + */ + if (was_intr) { + while (cache->pass_intr != 0 && + F_ISSET(conn, WT_CONN_EVICTION_RUN) && + F_ISSET(thread, WT_THREAD_RUN)) + __wt_yield(); + continue; + } + __wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping"); + /* Don't rely on signals: check periodically. 
*/ __wt_cond_auto_wait( - session, cache->evict_cond, did_work); + session, cache->evict_cond, did_work, NULL); __wt_verbose(session, WT_VERB_EVICTSERVER, "waking"); } else WT_ERR(__evict_lru_pages(session, false)); @@ -299,14 +369,13 @@ err: WT_PANIC_MSG(session, ret, "cache eviction thread error"); static int __evict_server(WT_SESSION_IMPL *session, bool *did_work) { +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) + struct timespec now; +#endif WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; -#ifdef HAVE_DIAGNOSTIC - struct timespec now; -#endif uint64_t orig_pages_evicted; - u_int spins; conn = S2C(session); cache = conn->cache; @@ -317,7 +386,8 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) /* Evict pages from the cache as needed. */ WT_RET(__evict_pass(session)); - if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) + if (!F_ISSET(conn, WT_CONN_EVICTION_RUN) || + cache->pass_intr != 0) return (0); /* @@ -325,35 +395,31 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) * otherwise we can block applications evicting large pages. */ if (!__wt_cache_stuck(session)) { - for (spins = 0; (ret = __wt_spin_trylock( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } /* - * If we gave up acquiring the lock, that indicates a - * session is waiting for us to clear walks. Do that - * as part of a normal pass (without the handle list + * Try to get the handle list lock: if we give up, that + * indicates a session is waiting for us to clear walks. Do + * that as part of a normal pass (without the handle list * lock) to avoid deadlock. 
*/ - if (ret == EBUSY) + if ((ret = __evict_lock_handle_list(session)) == EBUSY) return (0); WT_RET(ret); ret = __evict_clear_all_walks(session); - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); WT_RET(ret); cache->pages_evicted = 0; } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) __wt_epoch(session, &cache->stuck_ts); } else if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) { /* - * After being stuck for 5 minutes, give up. + * If we're stuck for 5 minutes in diagnostic mode, or the + * verbose evict_stuck flag is configured, log the cache + * and transaction state. + * + * If we're stuck for 5 minutes in diagnostic mode, give up. * * We don't do this check for in-memory workloads because * application threads are not blocked by the cache being full. @@ -362,11 +428,22 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) */ __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { - ret = ETIMEDOUT; - __wt_err(session, ret, +#if defined(HAVE_DIAGNOSTIC) + __wt_err(session, ETIMEDOUT, "Cache stuck for too long, giving up"); - WT_TRET(__wt_cache_dump(session, NULL)); + ret = ETIMEDOUT; + WT_TRET(__wt_verbose_dump_txn(session)); + WT_TRET(__wt_verbose_dump_cache(session)); return (ret); +#elif defined(HAVE_VERBOSE) + if (WT_VERBOSE_ISSET(session, WT_VERB_EVICT_STUCK)) { + WT_RET(__wt_verbose_dump_txn(session)); + WT_RET(__wt_verbose_dump_cache(session)); + + /* Reset the timer. */ + __wt_epoch(session, &cache->stuck_ts); + } +#endif } #endif } @@ -389,11 +466,13 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_EVICTION_RUN); - /* Create the eviction thread group */ + /* + * Create the eviction thread group. + * Set the group size to the maximum allowed sessions. 
+ */ WT_RET(__wt_thread_group_create(session, &conn->evict_threads, - "eviction-server", conn->evict_threads_min, - conn->evict_threads_max, WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, - __wt_evict_thread_run)); + "eviction-server", conn->evict_threads_min, conn->evict_threads_max, + WT_THREAD_CAN_WAIT | WT_THREAD_PANIC_FAIL, __wt_evict_thread_run)); /* * Allow queues to be populated now that the eviction threads @@ -420,7 +499,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) return (0); /* Wait for any eviction thread group changes to stabilize. */ - __wt_writelock(session, conn->evict_threads.lock); + __wt_writelock(session, &conn->evict_threads.lock); /* * Signal the threads to finish and stop populating the queue. @@ -548,6 +627,8 @@ __evict_pass(WT_SESSION_IMPL *session) if (loop == 0) prev = now; + if (conn->evict_threads.threads[0]->session == session) + WT_RET(__evict_tune_workers(session)); /* * Increment the shared read generation. Do this occasionally * even if eviction is not currently required, so that pages @@ -573,14 +654,6 @@ __evict_pass(WT_SESSION_IMPL *session) if (!__evict_update_work(session)) break; - /* - * Try to start a new thread if we have capacity and haven't - * reached the eviction targets. - */ - if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) - WT_RET(__wt_thread_group_start_one( - session, &conn->evict_threads, false)); - __wt_verbose(session, WT_VERB_EVICTSERVER, "Eviction pass with: Max: %" PRIu64 " In use: %" PRIu64 " Dirty: %" PRIu64, @@ -655,8 +728,8 @@ __evict_pass(WT_SESSION_IMPL *session) */ WT_STAT_CONN_INCR(session, cache_eviction_server_slept); - __wt_cond_wait( - session, cache->evict_cond, WT_THOUSAND); + __wt_cond_wait(session, + cache->evict_cond, WT_THOUSAND, NULL); continue; } @@ -683,7 +756,7 @@ __evict_pass(WT_SESSION_IMPL *session) * Clear a single walk point. 
*/ static int -__evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) +__evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -700,14 +773,14 @@ __evict_clear_walk(WT_SESSION_IMPL *session, bool count_stat) if ((ref = btree->evict_ref) == NULL) return (0); - if (count_stat) - WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); + WT_STAT_CONN_INCR(session, cache_eviction_walks_abandoned); /* - * Clear evict_ref first, in case releasing it forces eviction (we - * assert we never try to evict the current eviction walk point). + * Clear evict_ref before releasing it in case that forces eviction (we + * assert that we never try to evict the current eviction walk point). */ btree->evict_ref = NULL; + WT_WITH_DHANDLE(cache->walk_session, session->dhandle, (ret = __wt_page_release(cache->walk_session, ref, WT_READ_NO_EVICT))); @@ -730,7 +803,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, - WT_TRET(__evict_clear_walk(session, true))); + WT_TRET(__evict_clear_walk(session))); return (ret); } @@ -751,31 +824,19 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) btree = S2BT(session); cache = S2C(session)->cache; - /* - * Hold the walk lock to set the no-eviction flag. - * - * The no-eviction flag can be set permanently, in which case we never - * increment the no-eviction count. - */ + /* Hold the walk lock to turn off eviction. */ __wt_spin_lock(session, &cache->evict_walk_lock); - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - if (btree->evict_disabled != 0) - ++btree->evict_disabled; + if (++btree->evict_disabled > 1) { __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); } - ++btree->evict_disabled; /* * Ensure no new pages from the file will be queued for eviction after - * this point. + * this point, then clear any existing LRU eviction walk for the file. 
+ */ - F_SET(btree, WT_BTREE_NO_EVICTION); (void)__wt_atomic_addv32(&cache->pass_intr, 1); - - /* Clear any existing LRU eviction walk for the file. */ - WT_WITH_PASS_LOCK(session, - ret = __evict_clear_walk(session, true)); + WT_WITH_PASS_LOCK(session, ret = __evict_clear_walk(session)); (void)__wt_atomic_subv32(&cache->pass_intr, 1); WT_ERR(ret); @@ -806,7 +867,6 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) if (0) { err: --btree->evict_disabled; - F_CLR(btree, WT_BTREE_NO_EVICTION); } __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); @@ -831,17 +891,233 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) */ WT_DIAGNOSTIC_YIELD; + /* Hold the walk lock to turn on eviction. */ + __wt_spin_lock(session, &cache->evict_walk_lock); WT_ASSERT(session, - btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); + btree->evict_ref == NULL && btree->evict_disabled > 0); + --btree->evict_disabled; + __wt_spin_unlock(session, &cache->evict_walk_lock); +} + +#define EVICT_TUNE_BATCH 1 /* Max workers to add each period */ +/* + * Data points needed before deciding if we should keep adding workers or settle + * on an earlier value. + */ +#define EVICT_TUNE_DATAPT_MIN 3 +#define EVICT_TUNE_PERIOD 1 /* Tune period in seconds */ + +/* + * We will do a fresh re-tune every this many seconds to adjust to + * significant phase changes. + */ +#define EVICT_FORCE_RETUNE 30 + +/* + * __evict_tune_workers -- + * Find the right number of eviction workers. Gradually ramp up the number of + * workers increasing the number in batches indicated by the setting above. + * Store the number of workers that gave us the best throughput so far and the + * number of data points we have tried. + * + * Every once in a while when we have the minimum number of data points we check + * whether the eviction throughput achieved with the current number of workers + * is the best we have seen so far. If so, we will keep increasing the number of + * workers.
If not, we are past the inflection point on the eviction throughput + * curve. In that case, we will set the number of workers to the best observed + * so far and settle into a stable state. + */ +static int +__evict_tune_workers(WT_SESSION_IMPL *session) +{ + struct timespec current_time; + WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + uint64_t delta_msec, delta_pages; + uint64_t pgs_evicted_cur, pgs_evicted_persec_cur, time_diff; + int32_t cur_threads, i, target_threads, thread_surplus; + + conn = S2C(session); + cache = conn->cache; + + WT_ASSERT(session, conn->evict_threads.threads[0]->session == session); + pgs_evicted_cur = pgs_evicted_persec_cur = 0; + + __wt_epoch(session, &current_time); + time_diff = WT_TIMEDIFF_SEC(current_time, conn->evict_tune_last_time); /* - * The no-eviction flag can be set permanently, in which case we never - * increment the no-eviction count. + * If we have reached the stable state and have not run long enough to + * surpass the forced re-tuning threshold, return. */ - __wt_spin_lock(session, &cache->evict_walk_lock); - if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) - F_CLR(btree, WT_BTREE_NO_EVICTION); - __wt_spin_unlock(session, &cache->evict_walk_lock); + if (conn->evict_tune_stable) { + if (time_diff < EVICT_FORCE_RETUNE) + return (0); + + /* + * Stable state was reached a long time ago. Let's re-tune. + * Reset all the state.
+ */ + conn->evict_tune_stable = 0; + conn->evict_tune_last_action_time.tv_sec = 0; + conn->evict_tune_pgs_last = 0; + conn->evict_tune_num_points = 0; + conn->evict_tune_pg_sec_max = 0; + conn->evict_tune_workers_best = 0; + + /* Reduce the number of eviction workers to the minimum */ + thread_surplus = + (int32_t)conn->evict_threads.current_threads - + (int32_t)conn->evict_threads_min; + + for (i = 0; i < thread_surplus; i++) { + WT_ERR(__wt_thread_group_stop_one( + session, &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_removed); + } + WT_STAT_CONN_INCR(session, cache_eviction_force_retune); + } else + if (time_diff < EVICT_TUNE_PERIOD) + /* + * If we have not reached stable state, don't do + * anything unless enough time has passed since the last + * time we have taken any action in this function. + */ + return (0); + + /* + * Measure the number of evicted pages so far. Eviction rate correlates + * to performance, so this is our metric of success. + */ + pgs_evicted_cur = cache->pages_evict; + + /* + * If we have recorded the number of pages evicted at the end of + * the previous measurement interval, we can compute the eviction + * rate in evicted pages per second achieved during the current + * measurement interval. + * Otherwise, we just record the number of evicted pages and return. + */ + if (conn->evict_tune_pgs_last == 0) + goto err; + + delta_msec = WT_TIMEDIFF_MS(current_time, conn->evict_tune_last_time); + delta_pages = pgs_evicted_cur - conn->evict_tune_pgs_last; + pgs_evicted_persec_cur = (delta_pages * WT_THOUSAND) / delta_msec; + conn->evict_tune_num_points++; + + /* + * Keep track of the maximum eviction throughput seen and the number + * of workers corresponding to that throughput. 
+ */ + if (pgs_evicted_persec_cur > conn->evict_tune_pg_sec_max) { + conn->evict_tune_pg_sec_max = pgs_evicted_persec_cur; + conn->evict_tune_workers_best = + conn->evict_threads.current_threads; + } + + /* + * Compare the current number of data points with the number + * needed variable. If they are equal, we will check whether + * we are still going up on the performance curve, in which + * case we will continue increasing the number of workers, or + * we are past the inflection point on the curve, in which case + * we will go back to the best observed number of workers and + * settle into a stable state. + */ + if (conn->evict_tune_num_points >= conn->evict_tune_datapts_needed) { + if (conn->evict_tune_workers_best == + conn->evict_threads.current_threads && + conn->evict_threads.current_threads < + conn->evict_threads_max) { + /* + * Keep adding workers. We will check again + * at the next check point. + */ + conn->evict_tune_datapts_needed += WT_MIN( + EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max - + conn->evict_threads.current_threads) / + EVICT_TUNE_BATCH); + } else { + /* + * We are past the inflection point. Choose the + * best number of eviction workers observed and + * settle into a stable state. + */ + thread_surplus = + (int32_t)conn->evict_threads.current_threads - + (int32_t)conn->evict_tune_workers_best; + + for (i = 0; i < thread_surplus; i++) { + /* + * If we get an error, it should be because we + * were unable to acquire the thread group lock. + * Break out of trying. 
+ */ + WT_ERR(__wt_thread_group_stop_one( + session, &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_removed); + } + WT_STAT_CONN_SET(session, + cache_eviction_stable_state_workers, + conn->evict_tune_workers_best); + conn->evict_tune_stable = true; + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + goto err; + } + } + + /* + * If we have not added any worker threads in the past, we set the + * number needed equal to the number of data points that we must + * accumulate before deciding if we should keep adding workers or settle + * on a previously tried value of workers. + */ + if (conn->evict_tune_last_action_time.tv_sec == 0) + conn->evict_tune_datapts_needed = WT_MIN(EVICT_TUNE_DATAPT_MIN, + (conn->evict_threads_max - + conn->evict_threads.current_threads) / EVICT_TUNE_BATCH); + + if (F_ISSET(cache, WT_CACHE_EVICT_ALL)) { + cur_threads = (int32_t)conn->evict_threads.current_threads; + target_threads = WT_MIN(cur_threads + EVICT_TUNE_BATCH, + (int32_t)conn->evict_threads_max); + /* + * Start the new threads. + */ + for (i = cur_threads; i < target_threads; ++i) { + /* + * If we get an error, it should be because we were + * unable to acquire the thread group lock. Break out + * of trying. + */ + WT_ERR(__wt_thread_group_start_one(session, + &conn->evict_threads, false)); + WT_STAT_CONN_INCR(session, + cache_eviction_worker_created); + __wt_verbose(session, WT_VERB_EVICTSERVER, + "added worker thread"); + } + conn->evict_tune_last_action_time = current_time; + } + + WT_STAT_CONN_SET(session, cache_eviction_active_workers, + conn->evict_threads.current_threads); + +err: conn->evict_tune_last_time = current_time; + conn->evict_tune_pgs_last = pgs_evicted_cur; + /* + * If we got an EBUSY trying to acquire the lock just return. + * We can try to tune the workers next time. 
+ */ + if (ret == EBUSY) + ret = 0; + return (ret); } /* @@ -867,7 +1143,8 @@ __evict_lru_pages(WT_SESSION_IMPL *session, bool is_server) /* If a worker thread found the queue empty, pause. */ if (ret == WT_NOTFOUND && !is_server && F_ISSET(S2C(session), WT_CONN_EVICTION_RUN)) - __wt_cond_wait(session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait( + session, conn->evict_threads.wait_cond, 10000, NULL); return (ret == WT_NOTFOUND ? 0 : ret); } @@ -1046,7 +1323,7 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - u_int max_entries, retries, slot, spins, start_slot, total_candidates; + u_int max_entries, retries, slot, start_slot, total_candidates; bool dhandle_locked, incr; conn = S2C(session); @@ -1084,16 +1361,7 @@ retry: while (slot < max_entries) { * reference count to keep it alive while we sweep. */ if (!dhandle_locked) { - for (spins = 0; (ret = __wt_spin_trylock( - session, &conn->dhandle_lock)) == EBUSY && - cache->pass_intr == 0; - spins++) { - if (spins < WT_THOUSAND) - __wt_yield(); - else - __wt_sleep(0, WT_THOUSAND); - } - WT_ERR(ret); + WT_ERR(__evict_lock_handle_list(session)); dhandle_locked = true; } @@ -1129,7 +1397,7 @@ retry: while (slot < max_entries) { /* Skip files that don't allow eviction. */ btree = dhandle->handle; - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree->evict_disabled > 0) continue; /* @@ -1172,7 +1440,7 @@ retry: while (slot < max_entries) { (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = true; - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; /* @@ -1185,13 +1453,23 @@ retry: while (slot < max_entries) { * the tree's current eviction point, and part of the process is * waiting on this thread to acknowledge that action. 
*/ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && + if (btree->evict_disabled == 0 && !__wt_spin_trylock(session, &cache->evict_walk_lock)) { - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + if (btree->evict_disabled == 0) { + /* + * Assert the handle has a root page: eviction + * should have been locked out if the tree is + * being discarded or the root page is changing. + * As this has not always been the case, assert + * to debug that change. + */ + WT_ASSERT(session, btree->root.page != NULL); + cache->evict_file_next = dhandle; - WT_WITH_DHANDLE(session, dhandle, ret = - __evict_walk_file(session, queue, - max_entries, &slot)); + WT_WITH_DHANDLE(session, dhandle, + ret = __evict_walk_file( + session, queue, max_entries, &slot)); + WT_ASSERT(session, session->split_gen == 0); } __wt_spin_unlock(session, &cache->evict_walk_lock); @@ -1219,7 +1497,7 @@ retry: while (slot < max_entries) { } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); + __wt_readunlock(session, &conn->dhandle_lock); dhandle_locked = false; } @@ -1282,8 +1560,8 @@ __evict_push_candidate(WT_SESSION_IMPL *session, * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, - u_int max_entries, u_int *slotp) +__evict_walk_file(WT_SESSION_IMPL *session, + WT_EVICT_QUEUE *queue, u_int max_entries, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; @@ -1315,6 +1593,19 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, start = queue->evict_queue + *slotp; remaining_slots = max_entries - *slotp; total_slots = max_entries - queue->evict_entries; + btree_inuse = cache_inuse = 0; + target_pages_clean = target_pages_dirty = 0; + + /* + * The number of times we should fill the queue by the end of + * considering all trees. + */ +#define QUEUE_FILLS_PER_PASS 10 + + /* + * The minimum number of pages we should consider per tree. 
+ */ +#define MIN_PAGES_PER_TREE 10 /* * The target number of pages for this tree is proportional to the @@ -1323,13 +1614,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, * cache (and only have to walk it once). */ if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { - btree_inuse = __wt_btree_bytes_inuse(session); + btree_inuse = __wt_btree_bytes_evictable(session); cache_inuse = __wt_cache_bytes_inuse(cache); bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_clean = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_clean = 0; + } if (F_ISSET(cache, WT_CACHE_EVICT_DIRTY)) { btree_inuse = __wt_btree_dirty_leaf_inuse(session); @@ -1337,35 +1627,58 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, bytes_per_slot = 1 + cache_inuse / total_slots; target_pages_dirty = (uint32_t)( (btree_inuse + bytes_per_slot / 2) / bytes_per_slot); - } else - target_pages_dirty = 0; + } - target_pages = WT_MAX(target_pages_clean, target_pages_dirty); + /* + * Weight the number of target pages by the number of times we want to + * fill the cache per pass through all the trees. Note that we don't + * build this into the calculation above because we don't want to favor + * small trees, so round to a whole number of slots (zero for small + * trees) before multiplying. + */ + target_pages = WT_MAX(target_pages_clean, target_pages_dirty) * + QUEUE_FILLS_PER_PASS; + /* + * Randomly walk trees with a small fraction of the cache in case there + * are so many trees that none of them use enough of the cache to be + * allocated slots. + * + * The chance of walking a tree is equal to the chance that a random + * byte in cache belongs to the tree, weighted by how many times we + * want to fill queues during a pass through all the trees in cache. 
+ */ if (target_pages == 0) { - /* - * Randomly walk trees with a tiny fraction of the cache in - * case there are so many trees that none of them use enough of - * the cache to be allocated slots. Walk small trees 1% of the - * time. - */ - if (__wt_random(&session->rnd) > UINT32_MAX / 100) + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) { + btree_inuse = __wt_btree_bytes_evictable(session); + cache_inuse = __wt_cache_bytes_inuse(cache); + } else { + btree_inuse = __wt_btree_dirty_leaf_inuse(session); + cache_inuse = __wt_cache_dirty_leaf_inuse(cache); + } + if (btree_inuse == 0 || cache_inuse == 0) + return (0); + if (__wt_random64(&session->rnd) % cache_inuse > + btree_inuse * QUEUE_FILLS_PER_PASS) return (0); - target_pages = 10; } + /* + * There is some cost associated with walking a tree. If we're going + * to visit this tree, always look for a minimum number of pages. + */ + if (target_pages < MIN_PAGES_PER_TREE) + target_pages = MIN_PAGES_PER_TREE; + + /* + * If the tree is dead or we're near the end of the queue, fill the + * remaining slots. + */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || target_pages > remaining_slots) target_pages = remaining_slots; end = start + target_pages; - walk_flags = - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; - - /* Randomize the walk direction. */ - if (btree->evict_walk_reverse) - FLD_SET(walk_flags, WT_READ_PREV); - /* * Examine at least a reasonable number of pages before deciding * whether to give up. When we are only looking for dirty pages, @@ -1376,9 +1689,44 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) min_pages *= 10; + walk_flags = + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT; + + /* + * Choose a random point in the tree if looking for candidates in a + * tree with no starting point set. This is mostly aimed at ensuring + * eviction fairly visits all pages in trees with a lot of in-cache + * content. 
+ */ + switch ((WT_EVICT_WALK_START)btree->evict_start_type) { + case WT_EVICT_WALK_NEXT: + break; + case WT_EVICT_WALK_PREV: + FLD_SET(walk_flags, WT_READ_PREV); + break; + case WT_EVICT_WALK_RAND_PREV: + FLD_SET(walk_flags, WT_READ_PREV); + /* FALLTHROUGH */ + case WT_EVICT_WALK_RAND_NEXT: + if (btree->evict_ref == NULL) { + /* Ensure internal pages indexes remain valid */ + WT_WITH_PAGE_INDEX(session, ret = __wt_random_descent( + session, &btree->evict_ref, true)); + WT_RET_NOTFOUND_OK(ret); + } + break; + } + + /* + * Get some more eviction candidate pages, starting at the last saved + * point. Clear the saved point immediately, we assert when discarding + * pages we're not discarding an eviction point, so this clear must be + * complete before the page is released. + */ + ref = btree->evict_ref; + btree->evict_ref = NULL; + /* - * Get some more eviction candidate pages. - * * !!! Take care terminating this loop. * * Don't make an extra call to __wt_tree_walk after we hit the end of a @@ -1391,7 +1739,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, for (evict = start, pages_queued = pages_seen = refs_walked = 0; evict < end && (ret == 0 || ret == WT_NOTFOUND); ret = __wt_tree_walk_count( - session, &btree->evict_ref, &refs_walked, walk_flags)) { + session, &ref, &refs_walked, walk_flags)) { /* * Check whether we're finding a good ratio of candidates vs * pages seen. Some workloads create "deserts" in trees where @@ -1402,10 +1750,18 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, pages_seen > min_pages && (pages_queued == 0 || (pages_seen / pages_queued) > (min_pages / target_pages)); - if (give_up) + if (give_up) { + /* + * Try a different walk start point next time if a + * walk gave up. 
+ */ + btree->evict_start_type = + (btree->evict_start_type + 1) % + WT_EVICT_WALK_START_NUM; break; + } - if ((ref = btree->evict_ref) == NULL) { + if (ref == NULL) { if (++restarts == 2) break; WT_STAT_CONN_INCR( @@ -1439,7 +1795,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, if (page->read_gen == WT_READGEN_NOTSET) __wt_cache_read_gen_new(session, page); - /* Pages we no longer need (clean or dirty), are found money. */ + /* Pages being forcibly evicted go on the urgent queue. */ if (page->read_gen == WT_READGEN_OLDEST || page->memory_footprint >= btree->splitmempage) { WT_STAT_CONN_INCR( @@ -1449,7 +1805,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue, continue; } - /* Pages that are empty or from dead trees are also good. */ + /* Pages that are empty or from dead trees are fast-tracked. */ if (__wt_page_is_empty(page) || F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) goto fast; @@ -1495,7 +1851,7 @@ fast: /* If the page can't be evicted, give up. */ ++pages_queued; if (WT_PAGE_IS_INTERNAL(page)) - ++internal_pages; + ++internal_pages; __wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" WT_SIZET_FMT, @@ -1508,12 +1864,10 @@ fast: /* If the page can't be evicted, give up. */ session, cache_eviction_pages_queued, (u_int)(evict - start)); /* - * If we didn't find any candidates in the file, reverse the direction - * of the walk and skip it next time. + * If we couldn't find the number of pages we were looking for, skip + * the tree next time. */ - if (give_up) - btree->evict_walk_reverse = !btree->evict_walk_reverse; - if (pages_queued == 0 && !urgent_queued) + if (pages_queued < target_pages / 2 && !urgent_queued) btree->evict_walk_period = WT_MIN( WT_MAX(1, 2 * btree->evict_walk_period), 100); else if (pages_queued == target_pages) @@ -1522,6 +1876,8 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period /= 2; /* + * Give up the walk occasionally. 
+ * * If we happen to end up on the root page or a page requiring urgent * eviction, clear it. We have to track hazard pointers, and the root * page complicates that calculation. @@ -1533,16 +1889,20 @@ fast: /* If the page can't be evicted, give up. */ * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ - if ((ref = btree->evict_ref) != NULL) { - /* Give up the walk occasionally. */ + if (ref != NULL) { if (__wt_ref_is_root(ref) || evict == start || give_up || ref->page->read_gen == WT_READGEN_OLDEST || - ref->page->memory_footprint >= btree->splitmempage) - WT_RET(__evict_clear_walk(session, restarts == 0)); - else if (ref->page->read_gen == WT_READGEN_OLDEST) + ref->page->memory_footprint >= btree->splitmempage) { + if (restarts == 0) + WT_STAT_CONN_INCR( + session, cache_eviction_walks_abandoned); + WT_RET(__wt_page_release(cache->walk_session, + ref, WT_READ_NO_EVICT)); + ref = NULL; + } else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( - session, &btree->evict_ref, - &refs_walked, walk_flags)); + session, &ref, &refs_walked, walk_flags)); + btree->evict_ref = ref; } WT_STAT_CONN_INCRV(session, cache_eviction_walk, refs_walked); @@ -1799,6 +2159,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; uint64_t init_evict_count, max_pages_evicted; + bool timer; conn = S2C(session); cache = conn->cache; @@ -1819,7 +2180,9 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) __wt_evict_server_wake(session); /* Track how long application threads spend doing eviction. 
*/ - if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) + timer = + WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL); + if (timer) __wt_epoch(session, &enter); for (init_evict_count = cache->pages_evict;; ret = 0) { @@ -1876,8 +2239,8 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) break; case WT_NOTFOUND: /* Allow the queue to re-populate before retrying. */ - __wt_cond_wait( - session, conn->evict_threads.wait_cond, 10000); + __wt_cond_wait(session, + conn->evict_threads.wait_cond, 10000, NULL); cache->app_waits++; break; default: @@ -1885,8 +2248,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) } } -err: if (WT_STAT_ENABLED(session) && - !F_ISSET(session, WT_SESSION_INTERNAL)) { +err: if (timer) { __wt_epoch(session, &leave); WT_STAT_CONN_INCRV(session, application_cache_time, WT_TIMEDIFF_US(leave, enter)); @@ -1914,7 +2276,7 @@ __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || - F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) + S2BT(session)->evict_disabled > 0) return (false); /* Append to the urgent queue if we can. */ @@ -1924,7 +2286,7 @@ __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) __wt_spin_lock(session, &cache->evict_queue_lock); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU) || - F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) + S2BT(session)->evict_disabled > 0) goto done; __wt_spin_lock(session, &urgent_queue->evict_lock); @@ -1973,125 +2335,140 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session) S2BT(session)->evict_priority = 0; } -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) /* - * __wt_cache_dump -- - * Dump debugging information to a file (default stderr) about the size of - * the files in the cache. + * __verbose_dump_cache_single -- + * Output diagnostic information about a single file in the cache. 
*/ -int -__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) +static int +__verbose_dump_cache_single(WT_SESSION_IMPL *session, + uint64_t *total_bytesp, uint64_t *total_dirty_bytesp) { - FILE *fp; - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle, *saved_dhandle; + WT_DATA_HANDLE *dhandle; WT_PAGE *page; WT_REF *next_walk; + size_t size; uint64_t intl_bytes, intl_bytes_max, intl_dirty_bytes; uint64_t intl_dirty_bytes_max, intl_dirty_pages, intl_pages; uint64_t leaf_bytes, leaf_bytes_max, leaf_dirty_bytes; uint64_t leaf_dirty_bytes_max, leaf_dirty_pages, leaf_pages; + + intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; + intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; + leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; + leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; + + next_walk = NULL; + while (__wt_tree_walk(session, &next_walk, + WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && + next_walk != NULL) { + page = next_walk->page; + size = page->memory_footprint; + + if (WT_PAGE_IS_INTERNAL(page)) { + ++intl_pages; + intl_bytes += size; + intl_bytes_max = WT_MAX(intl_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++intl_dirty_pages; + intl_dirty_bytes += size; + intl_dirty_bytes_max = + WT_MAX(intl_dirty_bytes_max, size); + } + } else { + ++leaf_pages; + leaf_bytes += size; + leaf_bytes_max = WT_MAX(leaf_bytes_max, size); + if (__wt_page_is_modified(page)) { + ++leaf_dirty_pages; + leaf_dirty_bytes += size; + leaf_dirty_bytes_max = + WT_MAX(leaf_dirty_bytes_max, size); + } + } + } + + dhandle = session->dhandle; + if (dhandle->checkpoint == NULL) + WT_RET(__wt_msg(session, "%s(<live>):", dhandle->name)); + else + WT_RET(__wt_msg(session, "%s(checkpoint=%s):", + dhandle->name, dhandle->checkpoint)); + if (intl_pages != 0) + WT_RET(__wt_msg(session, + "internal: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" 
PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + intl_pages, + intl_bytes / WT_MEGABYTE, + intl_pages - intl_dirty_pages, + intl_dirty_pages, + (intl_bytes - intl_dirty_bytes) / WT_MEGABYTE, + intl_dirty_bytes / WT_MEGABYTE, + intl_bytes_max / WT_MEGABYTE, + intl_dirty_bytes_max / WT_MEGABYTE)); + if (leaf_pages != 0) + WT_RET(__wt_msg(session, + "leaf: " + "%" PRIu64 " pages, " + "%" PRIu64 "MB, " + "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " + "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " + "%" PRIu64 "MB max page, " + "%" PRIu64 "MB max dirty page", + leaf_pages, + leaf_bytes / WT_MEGABYTE, + leaf_pages - leaf_dirty_pages, + leaf_dirty_pages, + (leaf_bytes - leaf_dirty_bytes) / WT_MEGABYTE, + leaf_dirty_bytes / WT_MEGABYTE, + leaf_bytes_max / WT_MEGABYTE, + leaf_dirty_bytes_max / WT_MEGABYTE)); + + *total_bytesp += intl_bytes + leaf_bytes; + *total_dirty_bytesp += intl_dirty_bytes + leaf_dirty_bytes; + + return (0); +} + +/* + * __wt_verbose_dump_cache -- + * Output diagnostic information about the cache. + */ +int +__wt_verbose_dump_cache(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; uint64_t total_bytes, total_dirty_bytes; - size_t size; conn = S2C(session); total_bytes = total_dirty_bytes = 0; - if (ofile == NULL) - fp = stderr; - else if ((fp = fopen(ofile, "w")) == NULL) - return (EIO); - - /* Note: odd string concatenation avoids spelling errors. 
*/ - (void)fprintf(fp, "==========\n" "cache dump\n"); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "cache dump")); - saved_dhandle = session->dhandle; - TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + for (dhandle = NULL;;) { + WT_WITH_HANDLE_LIST_READ_LOCK(session, + WT_DHANDLE_NEXT(session, dhandle, &conn->dhqh, q)); + if (dhandle == NULL) + break; if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - intl_bytes = intl_bytes_max = intl_dirty_bytes = 0; - intl_dirty_bytes_max = intl_dirty_pages = intl_pages = 0; - leaf_bytes = leaf_bytes_max = leaf_dirty_bytes = 0; - leaf_dirty_bytes_max = leaf_dirty_pages = leaf_pages = 0; - - next_walk = NULL; - session->dhandle = dhandle; - while (__wt_tree_walk(session, &next_walk, - WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_WAIT) == 0 && - next_walk != NULL) { - page = next_walk->page; - size = page->memory_footprint; - - if (WT_PAGE_IS_INTERNAL(page)) { - ++intl_pages; - intl_bytes += size; - intl_bytes_max = WT_MAX(intl_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++intl_dirty_pages; - intl_dirty_bytes += size; - intl_dirty_bytes_max = - WT_MAX(intl_dirty_bytes_max, size); - } - } else { - ++leaf_pages; - leaf_bytes += size; - leaf_bytes_max = WT_MAX(leaf_bytes_max, size); - if (__wt_page_is_modified(page)) { - ++leaf_dirty_pages; - leaf_dirty_bytes += size; - leaf_dirty_bytes_max = - WT_MAX(leaf_dirty_bytes_max, size); - } - } - } - session->dhandle = NULL; - - if (dhandle->checkpoint == NULL) - (void)fprintf(fp, "%s(<live>): \n", dhandle->name); - else - (void)fprintf(fp, "%s(checkpoint=%s): \n", - dhandle->name, dhandle->checkpoint); - if (intl_pages != 0) - (void)fprintf(fp, - "\t" "internal: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - intl_pages, - intl_bytes >> 20, - intl_pages 
- intl_dirty_pages, - intl_dirty_pages, - (intl_bytes - intl_dirty_bytes) >> 20, - intl_dirty_bytes >> 20, - intl_bytes_max >> 20, - intl_dirty_bytes_max >> 20); - if (leaf_pages != 0) - (void)fprintf(fp, - "\t" "leaf: " - "%" PRIu64 " pages, " - "%" PRIu64 "MB, " - "%" PRIu64 "/%" PRIu64 " clean/dirty pages, " - "%" PRIu64 "/%" PRIu64 " clean/dirty MB, " - "%" PRIu64 "MB max page, " - "%" PRIu64 "MB max dirty page\n", - leaf_pages, - leaf_bytes >> 20, - leaf_pages - leaf_dirty_pages, - leaf_dirty_pages, - (leaf_bytes - leaf_dirty_bytes) >> 20, - leaf_dirty_bytes >> 20, - leaf_bytes_max >> 20, - leaf_dirty_bytes_max >> 20); - - total_bytes += intl_bytes + leaf_bytes; - total_dirty_bytes += intl_dirty_bytes + leaf_dirty_bytes; + WT_WITH_DHANDLE(session, dhandle, + ret = __verbose_dump_cache_single( + session, &total_bytes, &total_dirty_bytes)); + if (ret != 0) + break; } - session->dhandle = saved_dhandle; + WT_RET(ret); /* * Apply the overhead percentage so our total bytes are comparable with @@ -2099,16 +2476,16 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) */ total_bytes = __wt_cache_bytes_plus_overhead(conn->cache, total_bytes); - (void)fprintf(fp, + WT_RET(__wt_msg(session, "cache dump: " - "total found = %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB\n" - "total dirty bytes = %" PRIu64 "MB\n", - total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20, - total_dirty_bytes >> 20); - (void)fprintf(fp, "==========\n"); - - if (ofile != NULL && fclose(fp) != 0) - return (EIO); + "total found: %" PRIu64 "MB vs tracked inuse %" PRIu64 "MB", + total_bytes / WT_MEGABYTE, + __wt_cache_bytes_inuse(conn->cache) / WT_MEGABYTE)); + WT_RET(__wt_msg(session, + "total dirty bytes: %" PRIu64 "MB", + total_dirty_bytes / WT_MEGABYTE)); + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + return (0); } #endif diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 5b17a78a4dd..85689efd0b1 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c 
@@ -480,8 +480,8 @@ __evict_review( if (LF_ISSET(WT_EVICT_INMEM_SPLIT)) return (__wt_split_insert(session, ref)); - /* We are done if reconciliation is disabled. */ - if (F_ISSET(S2BT(session), WT_BTREE_NO_RECONCILE)) + /* If splits are the only permitted operation, we're done. */ + if (F_ISSET(S2BT(session), WT_BTREE_ALLOW_SPLITS)) return (EBUSY); } diff --git a/src/evict/evict_stat.c b/src/evict/evict_stat.c index 2dd3b1e83a0..7c2d5722a63 100644 --- a/src/evict/evict_stat.c +++ b/src/evict/evict_stat.c @@ -134,5 +134,5 @@ __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_STAT_DATA_SET(session, cache_state_root_size, btree->root.page->memory_footprint); - WT_WITH_HANDLE_LIST_LOCK(session, __evict_stat_walk(session)); + __evict_stat_walk(session); } diff --git a/src/include/api.h b/src/include/api.h index 2783d17f825..a3636eb8040 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -7,22 +7,21 @@ */ /* Standard entry points to the API: declares/initializes local variables. */ -#define API_SESSION_INIT(s, h, n, cur, dh) \ +#define API_SESSION_INIT(s, h, n, dh) \ WT_DATA_HANDLE *__olddh = (s)->dhandle; \ const char *__oldname = (s)->name; \ - (s)->cursor = (cur); \ (s)->dhandle = (dh); \ (s)->name = (s)->lastop = #h "." #n; \ -#define API_CALL_NOCONF(s, h, n, cur, dh) do { \ - API_SESSION_INIT(s, h, n, cur, dh); \ +#define API_CALL_NOCONF(s, h, n, dh) do { \ + API_SESSION_INIT(s, h, n, dh); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ __wt_verbose((s), WT_VERB_API, "CALL: " #h ":" #n) -#define API_CALL(s, h, n, cur, dh, config, cfg) do { \ - const char *cfg[] = \ +#define API_CALL(s, h, n, dh, config, cfg) do { \ + const char *(cfg)[] = \ { WT_CONFIG_BASE(s, h##_##n), config, NULL }; \ - API_SESSION_INIT(s, h, n, cur, dh); \ + API_SESSION_INIT(s, h, n, dh); \ WT_ERR(WT_SESSION_CHECK_PANIC(s)); \ if ((config) != NULL) \ WT_ERR(__wt_config_check((s), \ @@ -42,17 +41,17 @@ } while (0) /* An API call wrapped in a transaction if necessary. 
*/ -#define TXN_API_CALL(s, h, n, cur, bt, config, cfg) do { \ +#define TXN_API_CALL(s, h, n, bt, config, cfg) do { \ bool __autotxn = false; \ - API_CALL(s, h, n, bt, cur, config, cfg); \ + API_CALL(s, h, n, bt, config, cfg); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) /* An API call wrapped in a transaction if necessary. */ -#define TXN_API_CALL_NOCONF(s, h, n, cur, bt) do { \ +#define TXN_API_CALL_NOCONF(s, h, n, bt) do { \ bool __autotxn = false; \ - API_CALL_NOCONF(s, h, n, cur, bt); \ + API_CALL_NOCONF(s, h, n, bt); \ __autotxn = !F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT | WT_TXN_RUNNING);\ if (__autotxn) \ F_SET(&(s)->txn, WT_TXN_AUTOCOMMIT) @@ -63,15 +62,16 @@ if (__autotxn) { \ if (F_ISSET(&(s)->txn, WT_TXN_AUTOCOMMIT)) \ F_CLR(&(s)->txn, WT_TXN_AUTOCOMMIT); \ - else if (ret == 0 && !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \ - ret = __wt_txn_commit((s), NULL); \ + else if ((ret) == 0 && \ + !F_ISSET(&(s)->txn, WT_TXN_ERROR)) \ + (ret) = __wt_txn_commit((s), NULL); \ else { \ if (retry) \ WT_TRET(__wt_session_copy_values(s)); \ WT_TRET(__wt_txn_rollback((s), NULL)); \ - if ((ret == 0 || ret == WT_ROLLBACK) && \ + if (((ret) == 0 || (ret) == WT_ROLLBACK) && \ (retry)) { \ - ret = 0; \ + (ret) = 0; \ continue; \ } \ WT_TRET(__wt_session_reset_cursors(s, false)); \ @@ -98,24 +98,24 @@ #define CONNECTION_API_CALL(conn, s, n, config, cfg) \ s = (conn)->default_session; \ - API_CALL(s, WT_CONNECTION, n, NULL, NULL, config, cfg) + API_CALL(s, WT_CONNECTION, n, NULL, config, cfg) #define CONNECTION_API_CALL_NOCONF(conn, s, n) \ s = (conn)->default_session; \ - API_CALL_NOCONF(s, WT_CONNECTION, n, NULL, NULL) + API_CALL_NOCONF(s, WT_CONNECTION, n, NULL) #define SESSION_API_CALL(s, n, config, cfg) \ - API_CALL(s, WT_SESSION, n, NULL, NULL, config, cfg) + API_CALL(s, WT_SESSION, n, NULL, config, cfg) #define SESSION_API_CALL_NOCONF(s, n) \ - API_CALL_NOCONF(s, WT_SESSION, n, NULL, NULL) + 
API_CALL_NOCONF(s, WT_SESSION, n, NULL) #define SESSION_TXN_API_CALL(s, n, config, cfg) \ - TXN_API_CALL(s, WT_SESSION, n, NULL, NULL, config, cfg) + TXN_API_CALL(s, WT_SESSION, n, NULL, config, cfg) #define CURSOR_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ + API_CALL_NOCONF(s, WT_CURSOR, n, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle) #define JOINABLE_CURSOR_CALL_CHECK(cur) \ @@ -128,7 +128,7 @@ #define CURSOR_REMOVE_API_CALL(cur, s, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, cur, \ + TXN_API_CALL_NOCONF(s, WT_CURSOR, remove, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); #define JOINABLE_CURSOR_REMOVE_API_CALL(cur, s, bt) \ @@ -137,7 +137,7 @@ #define CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ (s) = (WT_SESSION_IMPL *)(cur)->session; \ - TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ + TXN_API_CALL_NOCONF(s, WT_CURSOR, n, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && \ !F_ISSET((WT_BTREE *)(bt), WT_BTREE_IGNORE_CACHE) && \ @@ -153,4 +153,4 @@ #define ASYNCOP_API_CALL(conn, s, n) \ s = (conn)->default_session; \ - API_CALL_NOCONF(s, asyncop, n, NULL, NULL) + API_CALL_NOCONF(s, asyncop, n, NULL) diff --git a/src/include/bitstring.i b/src/include/bitstring.i index 08746beb9b9..118dc0bba01 100644 --- a/src/include/bitstring.i +++ b/src/include/bitstring.i @@ -230,7 +230,7 @@ __bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width) #define __BIT_GET(len, mask) \ case len: \ if (__bit_test(bitf, bit)) \ - value |= mask; \ + value |= (mask); \ ++bit \ /* FALLTHROUGH */ diff --git a/src/include/btmem.h b/src/include/btmem.h index 9bd835f5d09..f1bb08d2699 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -435,6 +435,19 @@ struct __wt_page_modify { }; /* + * WT_COL_RLE -- + * Variable-length column-store pages have an array of page entries with RLE + * counts greater than 1 when 
reading the page, so it's not necessary to walk + * the page counting records to find a specific entry. We can do a binary search + * in this array, then an offset calculation to find the cell. + */ +WT_PACKED_STRUCT_BEGIN(__wt_col_rle) + uint64_t recno; /* Record number of first repeat. */ + uint64_t rle; /* Repeat count. */ + uint32_t indx; /* Slot of entry in col_var. */ +WT_PACKED_STRUCT_END + +/* * WT_PAGE -- * The WT_PAGE structure describes the in-memory page information. */ @@ -470,6 +483,7 @@ struct __wt_page { */ struct { WT_REF *parent_ref; /* Parent reference */ + uint64_t split_gen; /* Generation of last split */ struct __wt_page_index { uint32_t entries; @@ -479,6 +493,8 @@ struct __wt_page { } intl; #undef pg_intl_parent_ref #define pg_intl_parent_ref u.intl.parent_ref +#undef pg_intl_split_gen +#define pg_intl_split_gen u.intl.split_gen /* * Macros to copy/set the index because the name is obscured to ensure @@ -491,7 +507,7 @@ struct __wt_page { #define WT_INTL_INDEX_GET_SAFE(page) \ ((page)->u.intl.__index) #define WT_INTL_INDEX_GET(session, page, pindex) do { \ - WT_ASSERT(session, session->split_gen != 0); \ + WT_ASSERT(session, (session)->split_gen != 0); \ (pindex) = WT_INTL_INDEX_GET_SAFE(page); \ } while (0) #define WT_INTL_INDEX_SET(page, v) do { \ @@ -515,53 +531,54 @@ struct __wt_page { } while (0) /* Row-store leaf page. */ - struct { - WT_ROW *d; /* Key/value pairs */ - uint32_t entries; /* Entries */ - } row; -#undef pg_row_d -#define pg_row_d u.row.d -#undef pg_row_entries -#define pg_row_entries u.row.entries + WT_ROW *row; /* Key/value pairs */ +#undef pg_row +#define pg_row u.row /* Fixed-length column-store leaf page. 
*/ - struct { - uint8_t *bitf; /* Values */ - uint32_t entries; /* Entries */ - } col_fix; + uint8_t *fix_bitf; /* Values */ #undef pg_fix_bitf -#define pg_fix_bitf u.col_fix.bitf -#undef pg_fix_entries -#define pg_fix_entries u.col_fix.entries +#define pg_fix_bitf u.fix_bitf /* Variable-length column-store leaf page. */ struct { - WT_COL *d; /* Values */ + WT_COL *col_var; /* Values */ /* - * Variable-length column-store files maintain a list of - * RLE entries on the page so it's unnecessary to walk - * the page counting records to find a specific entry. + * Variable-length column-store pages have an array + * of page entries with RLE counts greater than 1 when + * reading the page, so it's not necessary to walk the + * page counting records to find a specific entry. We + * can do a binary search in this array, then an offset + * calculation to find the cell. + * + * It's a separate structure to keep the page structure + * as small as possible. */ - WT_COL_RLE *repeats; /* RLE array for lookups */ - uint32_t nrepeats; /* Number of repeat slots */ - - uint32_t entries; /* Entries */ + struct __wt_col_var_repeat { + uint32_t nrepeats; /* repeat slots */ + WT_COL_RLE repeats[0]; /* lookup RLE array */ + } *repeats; +#define WT_COL_VAR_REPEAT_SET(page) \ + ((page)->u.col_var.repeats != NULL) } col_var; -#undef pg_var_d -#define pg_var_d u.col_var.d +#undef pg_var +#define pg_var u.col_var.col_var #undef pg_var_repeats -#define pg_var_repeats u.col_var.repeats +#define pg_var_repeats u.col_var.repeats->repeats #undef pg_var_nrepeats -#define pg_var_nrepeats u.col_var.nrepeats -#undef pg_var_entries -#define pg_var_entries u.col_var.entries +#define pg_var_nrepeats u.col_var.repeats->nrepeats } u; /* - * The page's type and flags are positioned at the end of the WT_PAGE - * union, it reduces cache misses in the row-store search function. 
+ * Page entries, type and flags are positioned at the end of the WT_PAGE + * union to reduce cache misses in the row-store search function. + * + * The entries field only applies to leaf pages, internal pages use the + * page-index entries instead. */ + uint32_t entries; /* Leaf page entries */ + #define WT_PAGE_IS_INTERNAL(page) \ ((page)->type == WT_PAGE_COL_INT || (page)->type == WT_PAGE_ROW_INT) #define WT_PAGE_INVALID 0 /* Invalid page */ @@ -579,9 +596,8 @@ struct __wt_page { #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ #define WT_PAGE_OVERFLOW_KEYS 0x10 /* Page has overflow keys */ -#define WT_PAGE_SPLIT_BLOCK 0x20 /* Split blocking eviction and splits */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_UPDATE_IGNORE 0x80 /* Ignore updates on page discard */ +#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_UPDATE_IGNORE 0x40 /* Ignore updates on page discard */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ uint8_t unused[2]; /* Unused padding */ @@ -618,8 +634,8 @@ struct __wt_page { #define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; - /* The evict pass generation for the page */ - uint64_t evict_pass_gen; + + uint64_t evict_pass_gen; /* Eviction pass generation */ size_t memory_footprint; /* Memory attached to the page */ @@ -792,11 +808,11 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Walk the entries of an in-memory row-store leaf page. 
*/ #define WT_ROW_FOREACH(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d; (i) > 0; ++(rip), --(i)) + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row; (i) > 0; ++(rip), --(i)) #define WT_ROW_FOREACH_REVERSE(page, rip, i) \ - for ((i) = (page)->pg_row_entries, \ - (rip) = (page)->pg_row_d + ((page)->pg_row_entries - 1); \ + for ((i) = (page)->entries, \ + (rip) = (page)->pg_row + ((page)->entries - 1); \ (i) > 0; --(rip), --(i)) /* @@ -804,7 +820,7 @@ struct __wt_row { /* On-page key, on-page cell, or off-page WT_IKEY */ * Return the 0-based array offset based on a WT_ROW reference. */ #define WT_ROW_SLOT(page, rip) \ - ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row_d)) + ((uint32_t)(((WT_ROW *)(rip)) - (page)->pg_row)) /* * WT_COL -- @@ -829,18 +845,6 @@ struct __wt_col { }; /* - * WT_COL_RLE -- - * In variable-length column store leaf pages, we build an array of entries - * with RLE counts greater than 1 when reading the page. We can do a binary - * search in this array, then an offset calculation to find the cell. - */ -WT_PACKED_STRUCT_BEGIN(__wt_col_rle) - uint64_t recno; /* Record number of first repeat. */ - uint64_t rle; /* Repeat count. */ - uint32_t indx; /* Slot of entry in col_var.d */ -WT_PACKED_STRUCT_END - -/* * WT_COL_PTR, WT_COL_PTR_SET -- * Return/Set a pointer corresponding to the data offset. (If the item does * not exist on the page, return a NULL.) @@ -856,15 +860,15 @@ WT_PACKED_STRUCT_END * Walk the entries of variable-length column-store leaf page. */ #define WT_COL_FOREACH(page, cip, i) \ - for ((i) = (page)->pg_var_entries, \ - (cip) = (page)->pg_var_d; (i) > 0; ++(cip), --(i)) + for ((i) = (page)->entries, \ + (cip) = (page)->pg_var; (i) > 0; ++(cip), --(i)) /* * WT_COL_SLOT -- * Return the 0-based array offset based on a WT_COL reference. 
*/ #define WT_COL_SLOT(page, cip) \ - ((uint32_t)(((WT_COL *)cip) - (page)->pg_var_d)) + ((uint32_t)(((WT_COL *)(cip)) - (page)->pg_var)) /* * WT_IKEY -- @@ -973,10 +977,10 @@ struct __wt_insert { } key; } u; -#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)ins)->u.key.size) +#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size) #define WT_INSERT_KEY(ins) \ - ((void *)((uint8_t *)(ins) + ((WT_INSERT *)ins)->u.key.offset)) -#define WT_INSERT_RECNO(ins) (((WT_INSERT *)ins)->u.recno) + ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset)) +#define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno) WT_INSERT *next[0]; /* forward-linked skip list */ }; @@ -985,9 +989,9 @@ struct __wt_insert { * Skiplist helper macros. */ #define WT_SKIP_FIRST(ins_head) \ - (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->head[0]) + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->head[0]) #define WT_SKIP_LAST(ins_head) \ - (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)ins_head)->tail[0]) + (((ins_head) == NULL) ? NULL : ((WT_INSERT_HEAD *)(ins_head))->tail[0]) #define WT_SKIP_NEXT(ins) ((ins)->next[0]) #define WT_SKIP_FOREACH(ins, ins_head) \ for ((ins) = WT_SKIP_FIRST(ins_head); \ @@ -1000,7 +1004,7 @@ struct __wt_insert { #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \ if (((v) = (dest)) == NULL) { \ WT_ERR(__wt_calloc_def(s, count, &(v))); \ - if (__wt_atomic_cas_ptr(&dest, NULL, v)) \ + if (__wt_atomic_cas_ptr(&(dest), NULL, v)) \ __wt_cache_page_inmem_incr( \ s, page, (count) * sizeof(*(v))); \ else \ @@ -1041,7 +1045,7 @@ struct __wt_insert_head { #define WT_ROW_INSERT_SMALLEST(page) \ ((page)->modify == NULL || \ (page)->modify->mod_row_insert == NULL ? 
\ - NULL : (page)->modify->mod_row_insert[(page)->pg_row_entries]) + NULL : (page)->modify->mod_row_insert[(page)->entries]) /* * The column-store leaf page update lists are arrays of pointers to structures, diff --git a/src/include/btree.h b/src/include/btree.h index 595afc453c8..28fe1b94b23 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -58,6 +58,12 @@ #define WT_BTREE_DELETE_THRESHOLD 1000 /* + * Minimum size of the chunks (in percentage of the page size) a page gets split + * into during reconciliation. + */ +#define WT_BTREE_MIN_SPLIT_PCT 50 + +/* * WT_BTREE -- * A btree handle. */ @@ -114,23 +120,26 @@ struct __wt_btree { int split_pct; /* Split page percent */ WT_COMPRESSOR *compressor; /* Page compressor */ WT_KEYED_ENCRYPTOR *kencryptor; /* Page encryptor */ - WT_RWLOCK *ovfl_lock; /* Overflow lock */ + WT_RWLOCK ovfl_lock; /* Overflow lock */ uint64_t last_recno; /* Column-store last record number */ - WT_REF root; /* Root page reference */ - bool modified; /* If the tree ever modified */ - bool bulk_load_ok; /* Bulk-load is a possibility */ + WT_REF root; /* Root page reference */ + bool modified; /* If the tree ever modified */ + uint8_t original; /* Newly created: bulk-load possible + (want a bool but needs atomic cas) */ + + bool lsm_primary; /* Handle is/was the LSM primary */ WT_BM *bm; /* Block manager reference */ u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */ uint64_t checkpoint_gen; /* Checkpoint generation */ - bool include_checkpoint_txn;/* ID checks include checkpoint */ uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ uint64_t bytes_inmem; /* Cache bytes in memory. */ + uint64_t bytes_dirty_intl; /* Bytes in dirty internal pages. */ uint64_t bytes_dirty_leaf; /* Bytes in dirty leaf pages. 
*/ WT_REF *evict_ref; /* Eviction thread's location */ @@ -138,10 +147,10 @@ struct __wt_btree { u_int evict_walk_period; /* Skip this many LRU walks */ u_int evict_walk_saved; /* Saved walk skips for checkpoints */ u_int evict_walk_skips; /* Number of walks skipped */ - u_int evict_disabled; /* Eviction disabled count */ + int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ - bool evict_walk_reverse; /* Walk direction */ - + int evict_start_type; /* Start position for eviction walk + (see WT_EVICT_WALK_START). */ enum { WT_CKPT_OFF, WT_CKPT_PREPARE, WT_CKPT_RUNNING } checkpointing; /* Checkpoint in progress */ @@ -154,15 +163,14 @@ struct __wt_btree { WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ -#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ -#define WT_BTREE_IGNORE_CACHE 0x000200 /* Cache-resident object */ -#define WT_BTREE_IN_MEMORY 0x000400 /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x000800 /* Look-aside table */ -#define WT_BTREE_LSM_PRIMARY 0x001000 /* Handle is current LSM primary */ -#define WT_BTREE_NO_CHECKPOINT 0x002000 /* Disable checkpoints */ -#define WT_BTREE_NO_EVICTION 0x004000 /* Disable eviction */ +#define WT_BTREE_ALLOW_SPLITS 0x000100 /* Allow splits, even with no evict */ +#define WT_BTREE_BULK 0x000200 /* Bulk-load handle */ +#define WT_BTREE_CLOSED 0x000400 /* Handle closed */ +#define WT_BTREE_IGNORE_CACHE 0x000800 /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x001000 /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x002000 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x004000 /* Disable checkpoints */ #define WT_BTREE_NO_LOGGING 0x008000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x010000 /* Allow splits, even with no evict */ #define WT_BTREE_REBALANCE 0x020000 /* Handle is for rebalance */ #define WT_BTREE_SALVAGE 0x040000 /* Handle is for 
salvage */ #define WT_BTREE_SKIP_CKPT 0x080000 /* Handle skipped checkpoint */ diff --git a/src/include/btree.i b/src/include/btree.i index 4f69c258621..1d6fcd6272c 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -71,6 +71,47 @@ __wt_btree_bytes_inuse(WT_SESSION_IMPL *session) } /* + * __wt_btree_bytes_evictable -- + * Return the number of bytes that can be evicted (i.e. bytes apart from + * the pinned root page). + */ +static inline uint64_t +__wt_btree_bytes_evictable(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + WT_PAGE *root_page; + uint64_t bytes_inmem, bytes_root; + + btree = S2BT(session); + cache = S2C(session)->cache; + root_page = btree->root.page; + + bytes_inmem = btree->bytes_inmem; + bytes_root = root_page == NULL ? 0 : root_page->memory_footprint; + + return (bytes_inmem <= bytes_root ? 0 : + __wt_cache_bytes_plus_overhead(cache, bytes_inmem - bytes_root)); +} + +/* + * __wt_btree_dirty_inuse -- + * Return the number of dirty bytes in use. + */ +static inline uint64_t +__wt_btree_dirty_inuse(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + WT_CACHE *cache; + + btree = S2BT(session); + cache = S2C(session)->cache; + + return (__wt_cache_bytes_plus_overhead(cache, + btree->bytes_dirty_intl + btree->bytes_dirty_leaf)); +} + +/* * __wt_btree_dirty_leaf_inuse -- * Return the number of bytes in use by dirty leaf pages. 
*/ @@ -105,11 +146,12 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) (void)__wt_atomic_addsize(&page->memory_footprint, size); if (__wt_page_is_modified(page)) { (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); + } else if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); + (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } } /* Track internal size in cache. */ @@ -238,10 +280,12 @@ __wt_cache_page_byte_dirty_decr( if (i == 5) return; - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_intl, + decr, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_intl, decr, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + } else if (!btree->lsm_primary) { __wt_cache_decr_check_uint64(session, &btree->bytes_dirty_leaf, decr, "WT_BTREE.bytes_dirty_leaf"); __wt_cache_decr_check_uint64(session, &cache->bytes_dirty_leaf, @@ -297,10 +341,11 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) */ size = page->memory_footprint; if (WT_PAGE_IS_INTERNAL(page)) { + (void)__wt_atomic_add64(&btree->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->bytes_dirty_intl, size); (void)__wt_atomic_add64(&cache->pages_dirty_intl, 1); } else { - if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { + if (!btree->lsm_primary) { (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); } @@ -368,7 +413,7 @@ __wt_cache_page_image_incr(WT_SESSION_IMPL *session, uint32_t size) * Evict pages from the cache. 
*/ static inline void -__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) +__wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page, bool rewrite) { WT_BTREE *btree; WT_CACHE *cache; @@ -392,23 +437,34 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update the cache's dirty-byte count. */ if (modify != NULL && modify->bytes_dirty != 0) { - if (WT_PAGE_IS_INTERNAL(page)) + if (WT_PAGE_IS_INTERNAL(page)) { + __wt_cache_decr_zero_uint64(session, + &btree->bytes_dirty_intl, + modify->bytes_dirty, "WT_BTREE.bytes_dirty_intl"); __wt_cache_decr_zero_uint64(session, &cache->bytes_dirty_intl, modify->bytes_dirty, "WT_CACHE.bytes_dirty_intl"); - else if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - __wt_cache_decr_zero_uint64(session, - &cache->bytes_dirty_leaf, - modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); + } else if (!btree->lsm_primary) { __wt_cache_decr_zero_uint64(session, &btree->bytes_dirty_leaf, modify->bytes_dirty, "WT_BTREE.bytes_dirty_leaf"); + __wt_cache_decr_zero_uint64(session, + &cache->bytes_dirty_leaf, + modify->bytes_dirty, "WT_CACHE.bytes_dirty_leaf"); } } /* Update pages and bytes evicted. */ (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint); - (void)__wt_atomic_addv64(&cache->pages_evict, 1); + + /* + * Don't count rewrites as eviction: there's no guarantee we are making + * real progress. 
+ */ + if (rewrite) + (void)__wt_atomic_subv64(&cache->pages_inmem, 1); + else + (void)__wt_atomic_addv64(&cache->pages_evict, 1); } /* @@ -984,7 +1040,7 @@ __wt_cursor_row_leaf_key(WT_CURSOR_BTREE *cbt, WT_ITEM *key) if (cbt->ins == NULL) { session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; - rip = &page->u.row.d[cbt->slot]; + rip = &page->pg_row[cbt->slot]; WT_RET(__wt_row_leaf_key(session, page, rip, key, false)); } else { key->data = WT_INSERT_KEY(cbt->ins); @@ -1181,11 +1237,10 @@ __wt_leaf_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * data in the last skiplist on the page. Split if there are enough * items and the skiplist does not fit within a single disk page. */ - ins_head = page->type == WT_PAGE_ROW_LEAF ? - (page->pg_row_entries == 0 ? + (page->entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : + WT_ROW_INSERT_SLOT(page, page->entries - 1)) : WT_COL_APPEND(page); if (ins_head == NULL) return (false); @@ -1299,9 +1354,14 @@ __wt_page_can_evict( * the original parent page's index, because evicting an internal page * discards its WT_REF array, and a thread traversing the original * parent page index might see a freed WT_REF. + * + * One special case where we know this is safe is if the handle is + * locked exclusive (e.g., when the whole tree is being evicted). In + * that case, no readers can be looking at an old index. 
*/ - if (WT_PAGE_IS_INTERNAL(page) && - F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_BLOCK)) + if (!F_ISSET(session->dhandle, WT_DHANDLE_EXCLUSIVE) && + WT_PAGE_IS_INTERNAL(page) && !__wt_split_obsolete( + session, page->pg_intl_split_gen)) return (false); /* @@ -1353,7 +1413,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || - F_ISSET(btree, WT_BTREE_NO_EVICTION) || + btree->evict_disabled > 0 || !__wt_page_can_evict(session, ref, NULL)) return (__wt_hazard_clear(session, ref)); @@ -1473,7 +1533,7 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) return (false); /* A tree that can be evicted always requires a switch. */ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree->evict_disabled == 0) return (true); /* Check for a tree with a single leaf page. */ @@ -1498,55 +1558,6 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) } /* - * __wt_btree_lsm_switch_primary -- - * Switch a btree handle to/from the current primary chunk of an LSM tree. - */ -static inline void -__wt_btree_lsm_switch_primary(WT_SESSION_IMPL *session, bool on) -{ - WT_BTREE *btree; - WT_CACHE *cache; - WT_PAGE *child, *root; - WT_PAGE_INDEX *pindex; - WT_REF *first; - size_t size; - - btree = S2BT(session); - cache = S2C(session)->cache; - root = btree->root.page; - - if (!F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) - F_SET(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); - if (!on && F_ISSET(btree, WT_BTREE_LSM_PRIMARY)) { - pindex = WT_INTL_INDEX_GET_SAFE(root); - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) || - pindex->entries != 1) - return; - first = pindex->index[0]; - - /* - * We're reaching down into the page without a hazard pointer, - * but that's OK because we know that no-eviction is set so the - * page can't disappear. 
- * - * While this tree was the primary, its dirty bytes were not - * included in the cache accounting. Fix that now before we - * open it up for eviction. - */ - child = first->page; - if (first->state == WT_REF_MEM && - child->type == WT_PAGE_ROW_LEAF && - __wt_page_is_modified(child)) { - size = child->modify->bytes_dirty; - (void)__wt_atomic_add64(&btree->bytes_dirty_leaf, size); - (void)__wt_atomic_add64(&cache->bytes_dirty_leaf, size); - } - - F_CLR(btree, WT_BTREE_LSM_PRIMARY | WT_BTREE_NO_EVICTION); - } -} - -/* * __wt_split_descent_race -- * Return if we raced with an internal page split when descending the tree. */ diff --git a/src/include/buf.i b/src/include/buf.i index ebbee6b4633..d192e292dcf 100644 --- a/src/include/buf.i +++ b/src/include/buf.i @@ -37,28 +37,30 @@ __wt_buf_extend(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) /* * __wt_buf_init -- - * Initialize a buffer at a specific size. + * Create an empty buffer at a specific size. */ static inline int __wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { + /* + * The buffer grow function does what we need, but anticipates data + * referenced by the buffer. Avoid any data copy by setting data to + * reference the buffer's allocated memory, and clearing it. + */ buf->data = buf->mem; - buf->size = 0; /* Clear existing data length */ - WT_RET(__wt_buf_grow(session, buf, size)); - - return (0); + buf->size = 0; + return (__wt_buf_grow(session, buf, size)); } /* * __wt_buf_initsize -- - * Initialize a buffer at a specific size, and set the data length. + * Create an empty buffer at a specific size, and set the data length. */ static inline int __wt_buf_initsize(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) { - buf->data = buf->mem; - buf->size = 0; /* Clear existing data length */ - WT_RET(__wt_buf_grow(session, buf, size)); + WT_RET(__wt_buf_init(session, buf, size)); + buf->size = size; /* Set the data length. 
*/ return (0); @@ -72,14 +74,15 @@ static inline int __wt_buf_set( WT_SESSION_IMPL *session, WT_ITEM *buf, const void *data, size_t size) { - /* Ensure the buffer is large enough. */ - WT_RET(__wt_buf_initsize(session, buf, size)); - - /* Copy the data, allowing for overlapping strings. */ - if (size != 0) - memmove(buf->mem, data, size); - - return (0); + /* + * The buffer grow function does what we need, but expects the data to + * be referenced by the buffer. If we're copying data from outside the + * buffer, set it up so it makes sense to the buffer grow function. (No + * test needed, this works if WT_ITEM.data is already set to "data".) + */ + buf->data = data; + buf->size = size; + return (__wt_buf_grow(session, buf, size)); } /* diff --git a/src/include/cache.h b/src/include/cache.h index 70f6169200d..04920c3585a 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -18,6 +18,15 @@ #define WT_EVICT_MAX_TREES 1000 /* Maximum walk points */ +/* Ways to position when starting an eviction walk. */ +typedef enum { + WT_EVICT_WALK_NEXT, + WT_EVICT_WALK_PREV, + WT_EVICT_WALK_RAND_NEXT, + WT_EVICT_WALK_RAND_PREV +} WT_EVICT_WALK_START; +#define WT_EVICT_WALK_START_NUM (WT_EVICT_WALK_RAND_PREV + 1) + /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. 
@@ -83,7 +92,7 @@ struct __wt_cache { uint64_t worker_evicts; /* Pages evicted by worker threads */ uint64_t evict_max_page_size; /* Largest page seen at eviction */ -#ifdef HAVE_DIAGNOSTIC +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) struct timespec stuck_ts; /* Stuck timestamp */ #endif diff --git a/src/include/cache.i b/src/include/cache.i index 17ab39e97d2..90dd1bcdda8 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -360,11 +360,13 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) /* * LSM sets the no-cache-check flag when holding the LSM tree lock, in - * that case, or when holding the schema or handle list locks (which - * block eviction), we don't want to highjack the thread for eviction. + * that case, or when holding the handle list, schema or table locks + * (which can block checkpoints and eviction), don't block the thread + * for eviction. */ if (F_ISSET(session, WT_SESSION_NO_EVICTION | - WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA | + WT_SESSION_LOCKED_TABLE)) return (0); /* In memory configurations don't block when the cache is full. */ @@ -372,11 +374,14 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) return (0); /* - * Threads operating on cache-resident trees are ignored because they're - * not contributing to the problem. + * Threads operating on cache-resident trees are ignored because + * they're not contributing to the problem. We also don't block while + * reading metadata because we're likely to be holding some other + * resources that could block checkpoints or eviction. */ btree = S2BT_SAFE(session); - if (btree != NULL && F_ISSET(btree, WT_BTREE_IN_MEMORY)) + if (btree != NULL && (F_ISSET(btree, WT_BTREE_IN_MEMORY) || + WT_IS_METADATA(session->dhandle))) return (0); /* Check if eviction is needed. 
*/ diff --git a/src/include/cell.i b/src/include/cell.i index c130768e595..71c2515daf0 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -361,14 +361,12 @@ __wt_cell_pack_leaf_key(WT_CELL *cell, uint8_t prefix, size_t size) cell->__chunk[0] = (uint8_t) ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT); return (1); - } else { - byte = (uint8_t)size; /* Type + length */ - cell->__chunk[0] = (uint8_t) - ((byte << WT_CELL_SHORT_SHIFT) | - WT_CELL_KEY_SHORT_PFX); - cell->__chunk[1] = prefix; /* Prefix */ - return (2); } + byte = (uint8_t)size; /* Type + length */ + cell->__chunk[0] = (uint8_t) + ((byte << WT_CELL_SHORT_SHIFT) | WT_CELL_KEY_SHORT_PFX); + cell->__chunk[1] = prefix; /* Prefix */ + return (2); } if (prefix == 0) { @@ -569,8 +567,8 @@ __wt_cell_unpack_safe( */ #define WT_CELL_LEN_CHK(t, len) do { \ if (start != NULL && \ - ((uint8_t *)t < (uint8_t *)start || \ - (((uint8_t *)t) + (len)) > (uint8_t *)end)) \ + ((uint8_t *)(t) < (uint8_t *)start || \ + (((uint8_t *)(t)) + (len)) > (uint8_t *)end)) \ return (WT_ERROR); \ } while (0) diff --git a/src/include/column.i b/src/include/column.i index d15f874b281..07b627315e6 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -108,7 +108,7 @@ __col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) /* Fast path the check for values at the end of the skiplist. */ if (recno > WT_INSERT_RECNO(ret_ins)) return (NULL); - else if (recno == WT_INSERT_RECNO(ret_ins)) + if (recno == WT_INSERT_RECNO(ret_ins)) return (ret_ins); /* @@ -127,7 +127,7 @@ __col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) if (cmp == 0) /* Exact match: return */ return (*insp); - else if (cmp > 0) /* Keep going at this level */ + if (cmp > 0) /* Keep going at this level */ insp = &(*insp)->next[i]; else { /* Drop down a level */ --i; @@ -221,13 +221,13 @@ __col_var_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. 
*/ - if (page->pg_var_nrepeats == 0) - return (page->pg_var_entries == 0 ? 0 : - ref->ref_recno + (page->pg_var_entries - 1)); + if (!WT_COL_VAR_REPEAT_SET(page)) + return (page->entries == 0 ? 0 : + ref->ref_recno + (page->entries - 1)); repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1]; return ((repeat->recno + repeat->rle) - 1 + - (page->pg_var_entries - (repeat->indx + 1))); + (page->entries - (repeat->indx + 1))); } /* @@ -246,8 +246,7 @@ __col_fix_last_recno(WT_REF *ref) * This function ignores those records, our callers must handle that * explicitly, if they care. */ - return (page->pg_fix_entries == 0 ? - 0 : ref->ref_recno + (page->pg_fix_entries - 1)); + return (page->entries == 0 ? 0 : ref->ref_recno + (page->entries - 1)); } /* @@ -273,7 +272,9 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * slot for this record number, because we know any intervening records * have repeat counts of 1. */ - for (base = 0, limit = page->pg_var_nrepeats; limit != 0; limit >>= 1) { + for (base = 0, + limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; + limit != 0; limit >>= 1) { indx = base + (limit >> 1); repeat = page->pg_var_repeats + indx; @@ -281,7 +282,7 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) recno < repeat->recno + repeat->rle) { if (start_recnop != NULL) *start_recnop = repeat->recno; - return (page->pg_var_d + repeat->indx); + return (page->pg_var + repeat->indx); } if (recno < repeat->recno) continue; @@ -306,14 +307,14 @@ __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) * !!! * The test could be written more simply as: * - * (recno >= start_recno + (page->pg_var_entries - start_indx)) + * (recno >= start_recno + (page->entries - start_indx)) * * It's split into two parts because the simpler test will overflow if * searching for large record numbers. 
*/ if (recno >= start_recno && - recno - start_recno >= page->pg_var_entries - start_indx) + recno - start_recno >= page->entries - start_indx) return (NULL); - return (page->pg_var_d + start_indx + (uint32_t)(recno - start_recno)); + return (page->pg_var + start_indx + (uint32_t)(recno - start_recno)); } diff --git a/src/include/connection.h b/src/include/connection.h index 60ce5f55234..6c23492e926 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -107,7 +107,7 @@ struct __wt_named_extractor { * Allocate some additional slots for internal sessions so the user cannot * configure too few sessions for us to run. */ -#define WT_EXTRA_INTERNAL_SESSIONS 10 +#define WT_EXTRA_INTERNAL_SESSIONS 20 /* * WT_CONN_CHECK_PANIC -- @@ -123,15 +123,19 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ - ++conn->dhandle_count; \ + ++(conn)->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ + WT_ASSERT(session, \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); \ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ - --conn->dhandle_count; \ + --(conn)->dhandle_count; \ } while (0) /* @@ -163,13 +167,13 @@ struct __wt_connection_impl { WT_SPINLOCK api_lock; /* Connection API spinlock */ WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ - WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ - WT_SPINLOCK table_lock; /* Table creation spinlock */ + WT_RWLOCK 
table_lock; /* Table list lock */ WT_SPINLOCK turtle_lock; /* Turtle file spinlock */ + WT_RWLOCK dhandle_lock; /* Data handle list lock */ /* * We distribute the btree page locks across a set of spin locks. Don't @@ -262,7 +266,7 @@ struct __wt_connection_impl { WT_TXN_GLOBAL txn_global; /* Global transaction state */ - WT_RWLOCK *hot_backup_lock; /* Hot backup serialization */ + WT_RWLOCK hot_backup_lock; /* Hot backup serialization */ bool hot_backup; /* Hot backup in progress */ char **hot_backup_list; /* Hot backup file list */ @@ -301,6 +305,15 @@ struct __wt_connection_impl { uint32_t evict_threads_max;/* Max eviction threads */ uint32_t evict_threads_min;/* Min eviction threads */ + uint32_t evict_tune_datapts_needed;/* Data needed to tune */ + struct timespec evict_tune_last_action_time;/* Time of last action */ + struct timespec evict_tune_last_time; /* Time of last check */ + uint32_t evict_tune_num_points; /* Number of values tried */ + uint64_t evict_tune_pgs_last; /* Number of pages evicted */ + uint64_t evict_tune_pg_sec_max; /* Max throughput encountered */ + bool evict_tune_stable; /* Are we stable? 
*/ + uint32_t evict_tune_workers_best;/* Best performing value */ + #define WT_STATLOG_FILENAME "WiredTigerStat.%d.%H" WT_SESSION_IMPL *stat_session; /* Statistics log session */ wt_thread_t stat_tid; /* Statistics log thread */ @@ -326,11 +339,11 @@ struct __wt_connection_impl { bool log_tid_set; /* Log server thread set */ WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */ WT_SESSION_IMPL *log_file_session;/* Log file thread session */ - wt_thread_t log_file_tid; /* Log file thread thread */ + wt_thread_t log_file_tid; /* Log file thread */ bool log_file_tid_set;/* Log file thread set */ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ - wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ + wt_thread_t log_wrlsn_tid; /* Log write lsn thread */ bool log_wrlsn_tid_set;/* Log write lsn thread set */ WT_LOG *log; /* Logging structure */ WT_COMPRESSOR *log_compressor;/* Logging compressor */ diff --git a/src/include/cursor.h b/src/include/cursor.h index d522abc2a56..f32b4250d30 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -52,8 +52,8 @@ { 0 }, /* recno raw buffer */ \ NULL, /* json_private */ \ NULL, /* lang_private */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM key */ \ - { NULL, 0, 0, NULL, 0 }, /* WT_ITEM value */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM key */ \ + { NULL, 0, NULL, 0, 0 }, /* WT_ITEM value */ \ 0, /* int saved_err */ \ NULL, /* internal_uri */ \ 0 /* uint32_t flags */ \ @@ -73,7 +73,7 @@ struct __wt_cursor_backup { #define WT_CURBACKUP_LOCKER 0x01 /* Hot-backup started */ uint8_t flags; }; -#define WT_CURSOR_BACKUP_ID(cursor) (((WT_CURSOR_BACKUP *)cursor)->maxid) +#define WT_CURSOR_BACKUP_ID(cursor) (((WT_CURSOR_BACKUP *)(cursor))->maxid) struct __wt_cursor_btree { WT_CURSOR iface; @@ -474,7 +474,7 @@ struct __wt_cursor_stat { * Return a reference to a statistic cursor's stats structures. 
*/ #define WT_CURSOR_STATS(cursor) \ - (((WT_CURSOR_STAT *)cursor)->stats) + (((WT_CURSOR_STAT *)(cursor))->stats) struct __wt_cursor_table { WT_CURSOR iface; @@ -493,7 +493,7 @@ struct __wt_cursor_table { }; #define WT_CURSOR_PRIMARY(cursor) \ - (((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]) + (((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0]) #define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r") @@ -550,4 +550,4 @@ struct __wt_cursor_table { } while (0) #define WT_CURSOR_RAW_OK \ - WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW + (WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW) diff --git a/src/include/cursor.i b/src/include/cursor.i index c3fcef9a13d..12044e0e228 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -76,34 +76,19 @@ __cursor_leave(WT_SESSION_IMPL *session) } /* - * __curfile_enter -- - * Activate a file cursor. - */ -static inline int -__curfile_enter(WT_CURSOR_BTREE *cbt) -{ - WT_SESSION_IMPL *session; - - session = (WT_SESSION_IMPL *)cbt->iface.session; - - if (!F_ISSET(cbt, WT_CBT_NO_TXN)) - WT_RET(__cursor_enter(session)); - F_SET(cbt, WT_CBT_ACTIVE); - return (0); -} - -/* - * __curfile_leave -- - * Clear a file cursor's position. + * __cursor_reset -- + * Reset the cursor, it no longer holds any position. */ static inline int -__curfile_leave(WT_CURSOR_BTREE *cbt) +__cursor_reset(WT_CURSOR_BTREE *cbt) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; + __cursor_pos_clear(cbt); + /* If the cursor was active, deactivate it. */ if (F_ISSET(cbt, WT_CBT_ACTIVE)) { if (!F_ISSET(cbt, WT_CBT_NO_TXN)) @@ -111,12 +96,15 @@ __curfile_leave(WT_CURSOR_BTREE *cbt) F_CLR(cbt, WT_CBT_ACTIVE); } + /* If we're not holding a cursor reference, we're done. */ + if (cbt->ref == NULL) + return (0); + /* * If we were scanning and saw a lot of deleted records on this page, * try to evict the page when we release it. 
*/ - if (cbt->ref != NULL && - cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD) + if (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; @@ -247,7 +235,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) #ifdef HAVE_DIAGNOSTIC __wt_cursor_key_order_reset(cbt); #endif - WT_RET(__curfile_leave(cbt)); + WT_RET(__cursor_reset(cbt)); } /* @@ -259,8 +247,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) /* If the transaction is idle, check that the cache isn't full. */ WT_RET(__wt_txn_idle_cache_check(session)); - if (!F_ISSET(cbt, WT_CBT_ACTIVE)) - WT_RET(__curfile_enter(cbt)); + /* Activate the file cursor. */ + if (!F_ISSET(cbt, WT_CBT_ACTIVE)) { + if (!F_ISSET(cbt, WT_CBT_NO_TXN)) + WT_RET(__cursor_enter(session)); + F_SET(cbt, WT_CBT_ACTIVE); + } /* * If this is an ordinary transactional cursor, make sure we are set up @@ -272,24 +264,6 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter) } /* - * __cursor_reset -- - * Reset the cursor. - */ -static inline int -__cursor_reset(WT_CURSOR_BTREE *cbt) -{ - WT_DECL_RET; - - /* - * The cursor is leaving the API, and no longer holds any position, - * generally called to clean up the cursor after an error. - */ - ret = __curfile_leave(cbt); - __cursor_pos_clear(cbt); - return (ret); -} - -/* * __cursor_row_slot_return -- * Return a row-store leaf page slot's K/V pair. 
*/ diff --git a/src/include/dhandle.h b/src/include/dhandle.h index d7802bb319b..8861e96112b 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -37,12 +37,30 @@ #define WT_SESSION_META_DHANDLE(s) \ (((WT_CURSOR_BTREE *)((s)->meta_cursor))->btree->dhandle) +#define WT_DHANDLE_ACQUIRE(dhandle) \ + (void)__wt_atomic_add32(&(dhandle)->session_ref, 1) + +#define WT_DHANDLE_RELEASE(dhandle) \ + (void)__wt_atomic_sub32(&(dhandle)->session_ref, 1) + +#define WT_DHANDLE_NEXT(session, dhandle, head, field) do { \ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));\ + if ((dhandle) == NULL) \ + (dhandle) = TAILQ_FIRST(head); \ + else { \ + WT_DHANDLE_RELEASE(dhandle); \ + (dhandle) = TAILQ_NEXT(dhandle, field); \ + } \ + if ((dhandle) != NULL) \ + WT_DHANDLE_ACQUIRE(dhandle); \ +} while (0) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. */ struct __wt_data_handle { - WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */ + WT_RWLOCK rwlock; /* Lock for shared/exclusive ops */ TAILQ_ENTRY(__wt_data_handle) q; TAILQ_ENTRY(__wt_data_handle) hashq; diff --git a/src/include/error.h b/src/include/error.h index bbb7f989332..c338acb370f 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -67,14 +67,16 @@ int __ret; \ if ((__ret = (a)) != 0 && \ (__ret == WT_PANIC || \ - ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \ + ret == 0 || ret == WT_DUPLICATE_KEY || \ + ret == WT_NOTFOUND || ret == WT_RESTART)) \ ret = __ret; \ } while (0) #define WT_TRET_ERROR_OK(a, e) do { \ int __ret; \ if ((__ret = (a)) != 0 && __ret != (e) && \ (__ret == WT_PANIC || \ - ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \ + ret == 0 || ret == WT_DUPLICATE_KEY || \ + ret == WT_NOTFOUND || ret == WT_RESTART)) \ ret = __ret; \ } while (0) #define WT_TRET_NOTFOUND_OK(a) WT_TRET_ERROR_OK(a, WT_NOTFOUND) diff --git a/src/include/extern.h b/src/include/extern.h index be042bcd6cb..55ba1bada7c 100644 --- 
a/src/include/extern.h +++ b/src/include/extern.h @@ -98,14 +98,14 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_A extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_btcur_update_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int 
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -126,6 +126,7 @@ extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_ref_out_int(WT_SESSION_IMPL *session, WT_REF *ref, bool rewrite) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -133,10 +134,10 @@ extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_btree_evictable(WT_SESSION_IMPL *session, bool on) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -150,6 +151,9 @@ extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, 
uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_random_descent(WT_SESSION_IMPL *session, WT_REF **refp, bool eviction) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags @@ -158,8 +162,10 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #endif ) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, 
WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -192,8 +198,6 @@ extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_las_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_create(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -254,6 +258,7 @@ extern WT_THREAD_RET __wt_cache_pool_server(void *arg) WT_GCC_FUNC_DECL_ATTRIBUT extern int __wt_checkpoint_server_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_server_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_conn_dhandle_alloc( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -262,7 +267,7 @@ extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *ur extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_connection_destroy(WT_CONNECTION_IMPL *conn) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -285,7 +290,7 @@ extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const extern int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curfile_next_random(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_curfile_update_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_curfile_insert_check(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_curjoin_joined(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -294,7 +299,7 @@ extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern size_t __wt_json_unpack_char(u_char ch, u_char *buf, size_t bufsz, bool force_unicode) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); -extern void __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_json_column_init(WT_CURSOR *cursor, const char *uri, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_json_tokname(int toktype) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -315,7 +320,7 @@ extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *e extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key) WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cursor_set_key(WT_CURSOR *cursor, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cursor_get_raw_key(WT_CURSOR *cursor, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -352,7 +357,7 @@ extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_curstat_cache_walk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -399,11 +404,10 @@ extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8 extern int __wt_logop_row_truncate_print(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_op_printlog(WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_log_slot_new(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_switch(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_init(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -453,13 +457,14 @@ extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) extern int 
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_apply_all(WT_SESSION_IMPL *session, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_meta_checkpoint_free(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ext_metadata_insert(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key, const char *value) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_ext_metadata_remove( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -613,11 +618,9 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_auto_alloc(WT_SESSION_IMPL *session, const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_auto_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -671,16 +674,16 @@ extern void __wt_huffman_close(WT_SESSION_IMPL *session, void *huffman_arg) WT_G extern void __wt_print_huffman_code(void *huffman_arg, uint16_t symbol) 
WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_encode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, const uint8_t *from_arg, size_t from_len, WT_ITEM *to_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void 
__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2_round(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_nlpo2(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern uint32_t __wt_log2_int(uint32_t n) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -689,6 +692,7 @@ extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2) WT_GCC_FUNC_DECL_ATTRIBUT extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern uint32_t __wt_random(WT_RAND_STATE 
volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern uint64_t __wt_random64(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -728,6 +732,7 @@ extern int __wt_thread_group_resize( WT_SESSION_IMPL *session, WT_THREAD_GROUP * extern int __wt_thread_group_create( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, const char *name, uint32_t min, uint32_t max, uint32_t flags, int (*run_func)(WT_SESSION_IMPL *session, WT_THREAD *context)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_group_start_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_thread_group_stop_one( WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void 
__wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -740,6 +745,7 @@ extern void __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATT extern void __wt_txn_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_txn_global_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_verbose_dump_txn(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index 5acb7b0ed27..57d94e392d1 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -12,8 +12,8 @@ extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapp extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern 
int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -24,8 +24,9 @@ extern bool __wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden") extern void __wt_stream_set_line_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void 
__wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); +extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_yield(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 11b45f11304..43127a0c79f 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -10,8 +10,8 @@ extern int __wt_os_win(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((war extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void 
*mapped_cookie) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, bool is_signalled, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern void __wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); @@ -22,9 +22,10 @@ extern bool __wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden") extern void __wt_stream_set_line_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_vsnprintf_len_incr( char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int 
__wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); -extern void __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); +extern int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("hidden"))); diff --git a/src/include/flags.h b/src/include/flags.h index e7a5ba066df..f26a45c68f5 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -6,19 +6,19 @@ #define WT_CONN_CACHE_POOL 0x00000001 #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_CLOSING 0x00000004 -#define WT_CONN_EVICTION_RUN 0x00000008 -#define WT_CONN_IN_MEMORY 0x00000010 -#define WT_CONN_LAS_OPEN 0x00000020 -#define WT_CONN_LEAK_MEMORY 0x00000040 -#define WT_CONN_LOG_SERVER_RUN 0x00000080 +#define WT_CONN_CLOSING_NO_MORE_OPENS 0x00000008 +#define WT_CONN_EVICTION_RUN 0x00000010 +#define WT_CONN_IN_MEMORY 0x00000020 +#define WT_CONN_LAS_OPEN 0x00000040 +#define WT_CONN_LEAK_MEMORY 0x00000080 #define WT_CONN_LSM_MERGE 0x00000100 #define WT_CONN_PANIC 0x00000200 #define WT_CONN_READONLY 0x00000400 #define WT_CONN_RECOVERING 0x00000800 #define 
WT_CONN_SERVER_ASYNC 0x00001000 #define WT_CONN_SERVER_CHECKPOINT 0x00002000 -#define WT_CONN_SERVER_LSM 0x00004000 -#define WT_CONN_SERVER_RUN 0x00008000 +#define WT_CONN_SERVER_LOG 0x00004000 +#define WT_CONN_SERVER_LSM 0x00008000 #define WT_CONN_SERVER_STATISTICS 0x00010000 #define WT_CONN_SERVER_SWEEP 0x00020000 #define WT_CONN_WAS_BACKUP 0x00040000 @@ -53,22 +53,24 @@ #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_INTERNAL 0x00000002 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000004 -#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000008 -#define WT_SESSION_LOCKED_METADATA 0x00000010 -#define WT_SESSION_LOCKED_PASS 0x00000020 -#define WT_SESSION_LOCKED_SCHEMA 0x00000040 -#define WT_SESSION_LOCKED_SLOT 0x00000080 -#define WT_SESSION_LOCKED_TABLE 0x00000100 -#define WT_SESSION_LOCKED_TURTLE 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_HANDLE_LIST_READ 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST_WRITE 0x00000010 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_PASS 0x00000040 +#define WT_SESSION_LOCKED_SCHEMA 0x00000080 +#define WT_SESSION_LOCKED_SLOT 0x00000100 +#define WT_SESSION_LOCKED_TABLE_READ 0x00000200 +#define WT_SESSION_LOCKED_TABLE_WRITE 0x00000400 +#define WT_SESSION_LOCKED_TURTLE 0x00000800 +#define WT_SESSION_LOGGING_INMEM 0x00001000 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00002000 +#define WT_SESSION_NO_CACHE 0x00004000 +#define WT_SESSION_NO_DATA_HANDLES 0x00008000 +#define WT_SESSION_NO_EVICTION 0x00010000 +#define WT_SESSION_NO_LOGGING 0x00020000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00040000 +#define 
WT_SESSION_QUIET_CORRUPT_FILE 0x00080000 +#define WT_SESSION_SERVER_ASYNC 0x00100000 #define WT_STAT_CLEAR 0x00000001 #define WT_STAT_JSON 0x00000002 #define WT_STAT_ON_CLOSE 0x00000004 @@ -90,27 +92,29 @@ #define WT_VERB_COMPACT 0x00000008 #define WT_VERB_EVICT 0x00000010 #define WT_VERB_EVICTSERVER 0x00000020 -#define WT_VERB_FILEOPS 0x00000040 -#define WT_VERB_HANDLEOPS 0x00000080 -#define WT_VERB_LOG 0x00000100 -#define WT_VERB_LSM 0x00000200 -#define WT_VERB_LSM_MANAGER 0x00000400 -#define WT_VERB_METADATA 0x00000800 -#define WT_VERB_MUTEX 0x00001000 -#define WT_VERB_OVERFLOW 0x00002000 -#define WT_VERB_READ 0x00004000 -#define WT_VERB_REBALANCE 0x00008000 -#define WT_VERB_RECONCILE 0x00010000 -#define WT_VERB_RECOVERY 0x00020000 -#define WT_VERB_SALVAGE 0x00040000 -#define WT_VERB_SHARED_CACHE 0x00080000 -#define WT_VERB_SPLIT 0x00100000 -#define WT_VERB_TEMPORARY 0x00200000 -#define WT_VERB_THREAD_GROUP 0x00400000 -#define WT_VERB_TRANSACTION 0x00800000 -#define WT_VERB_VERIFY 0x01000000 -#define WT_VERB_VERSION 0x02000000 -#define WT_VERB_WRITE 0x04000000 +#define WT_VERB_EVICT_STUCK 0x00000040 +#define WT_VERB_FILEOPS 0x00000080 +#define WT_VERB_HANDLEOPS 0x00000100 +#define WT_VERB_LOG 0x00000200 +#define WT_VERB_LSM 0x00000400 +#define WT_VERB_LSM_MANAGER 0x00000800 +#define WT_VERB_METADATA 0x00001000 +#define WT_VERB_MUTEX 0x00002000 +#define WT_VERB_OVERFLOW 0x00004000 +#define WT_VERB_READ 0x00008000 +#define WT_VERB_REBALANCE 0x00010000 +#define WT_VERB_RECONCILE 0x00020000 +#define WT_VERB_RECOVERY 0x00040000 +#define WT_VERB_RECOVERY_PROGRESS 0x00080000 +#define WT_VERB_SALVAGE 0x00100000 +#define WT_VERB_SHARED_CACHE 0x00200000 +#define WT_VERB_SPLIT 0x00400000 +#define WT_VERB_TEMPORARY 0x00800000 +#define WT_VERB_THREAD_GROUP 0x01000000 +#define WT_VERB_TRANSACTION 0x02000000 +#define WT_VERB_VERIFY 0x04000000 +#define WT_VERB_VERSION 0x08000000 +#define WT_VERB_WRITE 0x10000000 #define WT_VISIBILITY_ERR 0x00000080 /* * flags section: END diff 
--git a/src/include/intpack.i b/src/include/intpack.i index e8bea58cede..a534de9d9a8 100644 --- a/src/include/intpack.i +++ b/src/include/intpack.i @@ -59,21 +59,21 @@ /* Count the leading zero bytes. */ #if defined(__GNUC__) #define WT_LEADING_ZEROS(x, i) \ - (i = (x == 0) ? (int)sizeof(x) : __builtin_clzll(x) >> 3) + ((i) = ((x) == 0) ? (int)sizeof(x) : __builtin_clzll(x) >> 3) #elif defined(_MSC_VER) #define WT_LEADING_ZEROS(x, i) do { \ - if (x == 0) i = (int)sizeof(x); \ + if ((x) == 0) (i) = (int)sizeof(x); \ else { \ unsigned long __index; \ _BitScanReverse64(&__index, x); \ __index = 63 ^ __index; \ - i = (int)(__index >> 3); } \ + (i) = (int)(__index >> 3); } \ } while (0) #else #define WT_LEADING_ZEROS(x, i) do { \ uint64_t __x = (x); \ uint64_t __m = (uint64_t)0xff << 56; \ - for (i = 0; !(__x & __m) && i != 8; i++) \ + for ((i) = 0; !(__x & __m) && (i) != 8; (i)++) \ __m >>= 8; \ } while (0) #endif @@ -231,7 +231,8 @@ __wt_vpack_int(uint8_t **pp, size_t maxlen, int64_t x) if (x < NEG_2BYTE_MIN) { *p = NEG_MULTI_MARKER; return (__wt_vpack_negint(pp, maxlen, (uint64_t)x)); - } else if (x < NEG_1BYTE_MIN) { + } + if (x < NEG_1BYTE_MIN) { WT_SIZE_CHECK_PACK(2, maxlen); x -= NEG_2BYTE_MIN; *p++ = NEG_2BYTE_MARKER | GET_BITS(x, 13, 8); @@ -358,12 +359,10 @@ __wt_vsize_uint(uint64_t x) { if (x <= POS_1BYTE_MAX) return (1); - else if (x <= POS_2BYTE_MAX + 1) { + if (x <= POS_2BYTE_MAX + 1) return (2); - } else { - x -= POS_2BYTE_MAX + 1; - return (__wt_vsize_posint(x)); - } + x -= POS_2BYTE_MAX + 1; + return (__wt_vsize_posint(x)); } /* @@ -373,13 +372,12 @@ __wt_vsize_uint(uint64_t x) static inline size_t __wt_vsize_int(int64_t x) { - if (x < NEG_2BYTE_MIN) { + if (x < NEG_2BYTE_MIN) return (__wt_vsize_negint((uint64_t)x)); - } else if (x < NEG_1BYTE_MIN) { + if (x < NEG_1BYTE_MIN) return (2); - } else if (x < 0) { + if (x < 0) return (1); - } else - /* For non-negative values, use the unsigned code above. 
*/ - return (__wt_vsize_uint((uint64_t)x)); + /* For non-negative values, use the unsigned code above. */ + return (__wt_vsize_uint((uint64_t)x)); } diff --git a/src/include/lint.h b/src/include/lint.h index e20a83144ee..2d0f47988b7 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -29,9 +29,9 @@ __wt_atomic_fetch_add##name(type *vp, type v) \ { \ type orig; \ \ - old = *vp; \ + orig = *vp; \ *vp += v; \ - return (old); \ + return (orig); \ } \ static inline ret \ __wt_atomic_store##name(type *vp, type v) \ @@ -40,7 +40,7 @@ __wt_atomic_store##name(type *vp, type v) \ \ orig = *vp; \ *vp = v; \ - return (old); \ + return (orig); \ } \ static inline ret \ __wt_atomic_sub##name(type *vp, type v) \ @@ -49,9 +49,9 @@ __wt_atomic_sub##name(type *vp, type v) \ return (*vp); \ } \ static inline bool \ -__wt_atomic_cas##name(type *vp, type old, type new) \ +__wt_atomic_cas##name(type *vp, type orig, type new) \ { \ - if (*vp == old) { \ + if (*vp == orig) { \ *vp = new; \ return (true); \ } \ @@ -75,8 +75,8 @@ WT_ATOMIC_FUNC(size, size_t, size_t) * Pointer compare and swap. */ static inline bool -__wt_atomic_cas_ptr(void *vp, void *old, void *new) { - if (*(void **)vp == old) { +__wt_atomic_cas_ptr(void *vp, void *orig, void *new) { + if (*(void **)vp == orig) { *(void **)vp = new; return (true); } diff --git a/src/include/log.h b/src/include/log.h index 3f2cb2ba8e6..fb3c961417f 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -86,8 +86,8 @@ union __wt_lsn { * The high bit is reserved for the special states. If the high bit is * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state. 
*/ -#define WT_LOG_SLOT_FREE -1 /* Not in use */ -#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */ +#define WT_LOG_SLOT_FREE (-1) /* Not in use */ +#define WT_LOG_SLOT_WRITTEN (-2) /* Slot data written, not processed */ /* * We allocate the buffer size, but trigger a slot switch when we cross @@ -144,8 +144,8 @@ union __wt_lsn { /* Slot is in use, but closed to new joins */ #define WT_LOG_SLOT_CLOSED(state) \ (WT_LOG_SLOT_ACTIVE(state) && \ - (FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_CLOSE) && \ - !FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_RESERVED))) + (FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \ + !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED))) /* Slot is in use, all data copied into buffer */ #define WT_LOG_SLOT_INPROGRESS(state) \ (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state)) @@ -163,7 +163,7 @@ struct __wt_logslot { WT_CACHE_LINE_PAD_BEGIN volatile int64_t slot_state; /* Slot state */ int64_t slot_unbuffered; /* Unbuffered data in this slot */ - int32_t slot_error; /* Error value */ + int slot_error; /* Error value */ wt_off_t slot_start_offset; /* Starting file offset */ wt_off_t slot_last_offset; /* Last record offset */ WT_LSN slot_release_lsn; /* Slot release LSN */ @@ -185,7 +185,7 @@ struct __wt_logslot { #define WT_WITH_SLOT_LOCK(session, log, op) do { \ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \ WT_WITH_LOCK_WAIT(session, \ - &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ + &(log)->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ } while (0) struct __wt_myslot { @@ -193,7 +193,8 @@ struct __wt_myslot { wt_off_t end_offset; /* My end offset in buffer */ wt_off_t offset; /* Slot buffer offset */ #define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */ -#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */ +#define WT_MYSLOT_NEEDS_RELEASE 0x02 /* This thread is releasing the slot */ +#define WT_MYSLOT_UNBUFFERED 0x04 /* Write directly */ uint32_t flags; /* Flags */ }; 
@@ -235,7 +236,7 @@ struct __wt_log { WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */ - WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + WT_RWLOCK log_archive_lock;/* Archive and log cursors */ /* Notify any waiting threads when sync_lsn is updated. */ WT_CONDVAR *log_sync_cond; @@ -254,6 +255,7 @@ struct __wt_log { #define WT_SLOT_POOL 128 WT_LOGSLOT *active_slot; /* Active slot */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ + int32_t pool_index; /* Index into slot pool */ size_t slot_buf_size; /* Buffer size for slots */ #ifdef HAVE_DIAGNOSTIC uint64_t write_calls; /* Calls to log_write */ diff --git a/src/include/lsm.h b/src/include/lsm.h index fefed9daa81..e3f6897ef9d 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -23,11 +23,14 @@ struct __wt_lsm_worker_cookie { struct __wt_lsm_worker_args { WT_SESSION_IMPL *session; /* Session */ WT_CONDVAR *work_cond; /* Owned by the manager */ + wt_thread_t tid; /* Thread id */ + bool tid_set; /* Thread id set */ + u_int id; /* My manager slot id */ uint32_t type; /* Types of operations handled */ -#define WT_LSM_WORKER_RUN 0x01 - uint32_t flags; /* Worker flags */ + + volatile bool running; /* Worker is running */ }; /* @@ -162,6 +165,9 @@ struct __wt_lsm_manager { #define WT_LSM_MAX_WORKERS 20 #define WT_LSM_MIN_WORKERS 3 WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS]; + +#define WT_LSM_MANAGER_SHUTDOWN 0x01 /* Manager has shut down */ + uint32_t flags; }; /* @@ -189,7 +195,7 @@ struct __wt_lsm_tree { #define LSM_TREE_MAX_QUEUE 100 uint32_t queue_ref; - WT_RWLOCK *rwlock; + WT_RWLOCK rwlock; TAILQ_ENTRY(__wt_lsm_tree) q; uint64_t dsk_gen; diff --git a/src/include/misc.h b/src/include/misc.h index 66d43496e93..9161a215fdc 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -63,7 +63,7 @@ #define WT_MAX(a, b) ((a) < (b) ? (b) : (a)) /* Elements in an array. 
*/ -#define WT_ELEMENTS(a) (sizeof(a) / sizeof(a[0])) +#define WT_ELEMENTS(a) (sizeof(a) / sizeof((a)[0])) /* 10 level skip lists, 1/4 have a link to the next element. */ #define WT_SKIP_MAXDEPTH 10 @@ -140,6 +140,7 @@ #define F_CLR(p, mask) FLD_CLR((p)->flags, mask) #define F_ISSET(p, mask) FLD_ISSET((p)->flags, mask) +#define F_ISSET_ALL(p, mask) (FLD_MASK((p)->flags, mask) == (mask)) #define F_MASK(p, mask) FLD_MASK((p)->flags, mask) #define F_SET(p, mask) FLD_SET((p)->flags, mask) @@ -180,14 +181,14 @@ */ #define WT_BINARY_SEARCH(key, arrayp, n, found) do { \ uint32_t __base, __indx, __limit; \ - found = false; \ + (found) = false; \ for (__base = 0, __limit = (n); __limit != 0; __limit >>= 1) { \ __indx = __base + (__limit >> 1); \ - if ((arrayp)[__indx] < key) { \ + if ((arrayp)[__indx] < (key)) { \ __base = __indx + 1; \ --__limit; \ - } else if ((arrayp)[__indx] == key) { \ - found = true; \ + } else if ((arrayp)[__indx] == (key)) { \ + (found) = true; \ break; \ } \ } \ @@ -206,8 +207,8 @@ /* Check if a string matches a prefix. */ #define WT_PREFIX_MATCH(str, pfx) \ - (((const char *)(str))[0] == ((const char *)pfx)[0] && \ - strncmp((str), (pfx), strlen(pfx)) == 0) + (((const char *)(str))[0] == ((const char *)(pfx))[0] && \ + strncmp(str, pfx, strlen(pfx)) == 0) /* Check if a string matches a prefix, and move past it. */ #define WT_PREFIX_SKIP(str, pfx) \ @@ -224,8 +225,8 @@ /* Check if a string matches a byte string of len bytes. 
*/ #define WT_STRING_MATCH(str, bytes, len) \ - (((const char *)str)[0] == ((const char *)bytes)[0] && \ - strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0') + (((const char *)(str))[0] == ((const char *)(bytes))[0] && \ + strncmp(str, bytes, len) == 0 && (str)[len] == '\0') /* * Macro that produces a string literal that isn't wrapped in quotes, to avoid diff --git a/src/include/misc.i b/src/include/misc.i index f36be32d6a2..7040886cf82 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -11,11 +11,12 @@ * Wait on a mutex, optionally timing out. */ static inline void -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *)) { bool notused; - __wt_cond_wait_signal(session, cond, usecs, &notused); + __wt_cond_wait_signal(session, cond, usecs, run_func, &notused); } /* @@ -85,3 +86,94 @@ __wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...) WT_UNUSED(fmt); #endif } + +/* + * __wt_snprintf -- + * snprintf convenience function, ignoring the returned size. + */ +static inline int +__wt_snprintf(char *buf, size_t size, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) +{ + WT_DECL_RET; + size_t len; + va_list ap; + + len = 0; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, &len, fmt, ap); + va_end(ap); + WT_RET(ret); + + /* It's an error if the buffer couldn't hold everything. */ + return (len >= size ? ERANGE : 0); +} + +/* + * __wt_vsnprintf -- + * vsnprintf convenience function, ignoring the returned size. + */ +static inline int +__wt_vsnprintf(char *buf, size_t size, const char *fmt, va_list ap) +{ + size_t len; + + len = 0; + + WT_RET(__wt_vsnprintf_len_incr(buf, size, &len, fmt, ap)); + + /* It's an error if the buffer couldn't hold everything. */ + return (len >= size ? 
ERANGE : 0); +} + +/* + * __wt_snprintf_len_set -- + * snprintf convenience function, setting the returned size. + */ +static inline int +__wt_snprintf_len_set( + char *buf, size_t size, size_t *retsizep, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + WT_DECL_RET; + va_list ap; + + *retsizep = 0; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap); + va_end(ap); + return (ret); +} + +/* + * __wt_vsnprintf_len_set -- + * vsnprintf convenience function, setting the returned size. + */ +static inline int +__wt_vsnprintf_len_set( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) +{ + *retsizep = 0; + + return (__wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap)); +} + +/* + * __wt_snprintf_len_incr -- + * snprintf convenience function, incrementing the returned size. + */ +static inline int +__wt_snprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + WT_DECL_RET; + va_list ap; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_incr(buf, size, retsizep, fmt, ap); + va_end(ap); + return (ret); +} diff --git a/src/include/mutex.h b/src/include/mutex.h index 6b81b1a6265..910eb7af5b9 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -21,8 +21,8 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ /* - * The following fields are only used for automatically adjusting - * condition variables. They could be in a separate structure. + * The following fields are used for automatically adjusting condition + * variable wait times. */ uint64_t min_wait; /* Minimum wait duration */ uint64_t max_wait; /* Maximum wait duration */ @@ -30,11 +30,14 @@ struct __wt_condvar { }; /* + * Read/write locks: + * + * WiredTiger uses read/write locks for shared/exclusive access to resources. * !!! 
* Don't modify this structure without understanding the read/write locking * functions. */ -typedef union { /* Read/write lock */ +union __wt_rwlock { /* Read/write lock */ uint64_t u; struct { uint32_t wr; /* Writers and readers */ @@ -45,19 +48,6 @@ typedef union { /* Read/write lock */ uint16_t next; /* Next available ticket number */ uint16_t writers_active;/* Count of active writers */ } s; -} wt_rwlock_t; - -/* - * Read/write locks: - * - * WiredTiger uses read/write locks for shared/exclusive access to resources. - */ -struct __wt_rwlock { - WT_CACHE_LINE_PAD_BEGIN - const char *name; /* Lock name for debugging */ - - wt_rwlock_t rwlock; /* Read/write lock */ - WT_CACHE_LINE_PAD_END }; /* @@ -72,31 +62,17 @@ struct __wt_rwlock { #define SPINLOCK_PTHREAD_MUTEX 2 #define SPINLOCK_PTHREAD_MUTEX_ADAPTIVE 3 -#if SPINLOCK_TYPE == SPINLOCK_GCC - struct __wt_spinlock { WT_CACHE_LINE_PAD_BEGIN +#if SPINLOCK_TYPE == SPINLOCK_GCC volatile int lock; - - /* - * We track acquisitions and time spent waiting for some locks. For - * performance reasons and to make it possible to write generic code - * that tracks statistics for different locks, we store the offset - * of the statistics fields to be updated during lock acquisition. 
- */ - int16_t stat_count_off; /* acquisitions offset */ - int16_t stat_app_usecs_off; /* waiting application threads offset */ - int16_t stat_int_usecs_off; /* waiting server threads offset */ - WT_CACHE_LINE_PAD_END -}; - #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ SPINLOCK_TYPE == SPINLOCK_MSVC - -struct __wt_spinlock { - WT_CACHE_LINE_PAD_BEGIN wt_mutex_t lock; +#else +#error Unknown spinlock type +#endif const char *name; /* Mutex name */ @@ -113,9 +89,3 @@ struct __wt_spinlock { int8_t initialized; /* Lock initialized, for cleanup */ WT_CACHE_LINE_PAD_END }; - -#else - -#error Unknown spinlock type - -#endif diff --git a/src/include/mutex.i b/src/include/mutex.i index a6309e0976b..2d483972ed2 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -14,6 +14,18 @@ * of instructions. */ +/* + * __spin_init_internal -- + * Initialize the WT portion of a spinlock. + */ +static inline void +__spin_init_internal(WT_SPINLOCK *t, const char *name) +{ + t->name = name; + t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; + t->initialized = 1; +} + #if SPINLOCK_TYPE == SPINLOCK_GCC /* Default to spinning 1000 times before yielding. 
*/ @@ -29,10 +41,9 @@ static inline int __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) { WT_UNUSED(session); - WT_UNUSED(name); t->lock = 0; - t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; + __spin_init_internal(t, name); return (0); } @@ -110,10 +121,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) #else WT_RET(pthread_mutex_init(&t->lock, NULL)); #endif - - t->name = name; - t->stat_count_off = t->stat_app_usecs_off = t->stat_int_usecs_off = -1; - t->initialized = 1; + __spin_init_internal(t, name); WT_UNUSED(session); return (0); @@ -195,8 +203,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) return (__wt_map_windows_error(windows_error)); } - t->name = name; - t->initialized = 1; + __spin_init_internal(t, name); return (0); } @@ -300,3 +307,22 @@ __wt_spin_lock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) } else __wt_spin_lock(session, t); } + +/* + * __wt_spin_trylock_track -- + * Try to lock a spinlock or fail immediately if it is busy. + * Track if successful. + */ +static inline int +__wt_spin_trylock_track(WT_SESSION_IMPL *session, WT_SPINLOCK *t) +{ + int64_t **stats; + + if (t->stat_count_off != -1 && WT_STAT_ENABLED(session)) { + WT_RET(__wt_spin_trylock(session, t)); + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][t->stat_count_off]++; + return (0); + } + return (__wt_spin_trylock(session, t)); +} diff --git a/src/include/os.h b/src/include/os.h index 7a8e47ed81f..73d89268392 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -11,8 +11,14 @@ * A call returning 0 indicates success; any call where \ * 0 is not the only successful return must provide an \ * expression evaluating to 0 in all successful cases. \ + * \ + * XXX \ + * Casting the call's return to int is because CentOS 7.3.1611 \ + * complains about syscall returning a long and the loss of \ + * integer precision in the assignment to ret. 
The cast should \ + * be a no-op everywhere. \ */ \ - if (((ret) = (call)) == 0) \ + if (((ret) = (int)(call)) == 0) \ break; \ /* \ * The call's error was either returned by the call or \ @@ -61,7 +67,7 @@ #define WT_TIMECMP(t1, t2) \ ((t1).tv_sec < (t2).tv_sec ? -1 : \ - (t1).tv_sec == (t2.tv_sec) ? \ + (t1).tv_sec == (t2).tv_sec ? \ (t1).tv_nsec < (t2).tv_nsec ? -1 : \ (t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1) diff --git a/src/include/os_windows.h b/src/include/os_windows.h index 65938ac9f17..c1e5f788dc6 100644 --- a/src/include/os_windows.h +++ b/src/include/os_windows.h @@ -43,16 +43,6 @@ typedef uint32_t u_int; typedef unsigned char u_char; typedef uint64_t u_long; -/* <= VS 2013 is not C99 compat */ -#if _MSC_VER < 1900 -#define snprintf _wt_snprintf - -_Check_return_opt_ int __cdecl _wt_snprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, ...); -#endif - /* * Windows does have ssize_t * Python headers declare also though so we need to guard it @@ -61,18 +51,6 @@ _Check_return_opt_ int __cdecl _wt_snprintf( typedef int ssize_t; #endif -/* - * Provide a custom version of vsnprintf that returns the - * needed buffer length instead of -1 on truncation - */ -#define vsnprintf _wt_vsnprintf - -_Check_return_opt_ int __cdecl _wt_vsnprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, - va_list _ArgList); - /* Provide a custom version of localtime_r */ struct tm *localtime_r(const time_t* timer, struct tm* result); diff --git a/src/include/packing.i b/src/include/packing.i index 17ca261bcfc..0eadb2f2027 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -104,8 +104,8 @@ __pack_name_next(WT_PACK_NAME *pn, WT_CONFIG_ITEM *name) WT_CONFIG_ITEM ignore; if (pn->genname) { - (void)snprintf(pn->buf, sizeof(pn->buf), - (pn->iskey ? 
"key%d" : "value%d"), pn->count); + WT_RET(__wt_snprintf(pn->buf, sizeof(pn->buf), + (pn->iskey ? "key%d" : "value%d"), pn->count)); WT_CLEAR(*name); name->str = pn->buf; name->len = strlen(pn->buf); @@ -168,10 +168,15 @@ next: if (pack->cur == pack->end) (int)(pack->end - pack->orig), pack->orig); return (0); case 'u': - case 'U': /* Special case for items with a size prefix. */ pv->type = (!pv->havesize && *pack->cur != '\0') ? 'U' : 'u'; return (0); + case 'U': + /* + * Don't change the type. 'U' is used internally, so this type + * was already changed to explicitly include the size. + */ + return (0); case 'b': case 'h': case 'i': @@ -193,7 +198,7 @@ next: if (pack->cur == pack->end) return (0); default: WT_RET_MSG(pack->session, EINVAL, - "Invalid type '%c' found in format '%.*s'", + "Invalid type '%c' found in format '%.*s'", pv->type, (int)(pack->end - pack->orig), pack->orig); } @@ -201,43 +206,43 @@ next: if (pack->cur == pack->end) #define WT_PACK_GET(session, pv, ap) do { \ WT_ITEM *__item; \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - pv.u.s = va_arg(ap, const char *); \ + (pv).u.s = va_arg(ap, const char *); \ break; \ case 'U': \ case 'u': \ __item = va_arg(ap, WT_ITEM *); \ - pv.u.item.data = __item->data; \ - pv.u.item.size = __item->size; \ + (pv).u.item.data = __item->data; \ + (pv).u.item.size = __item->size; \ break; \ case 'b': \ case 'h': \ case 'i': \ - pv.u.i = va_arg(ap, int); \ + (pv).u.i = va_arg(ap, int); \ break; \ case 'B': \ case 'H': \ case 'I': \ case 't': \ - pv.u.u = va_arg(ap, unsigned int); \ + (pv).u.u = va_arg(ap, unsigned int); \ break; \ case 'l': \ - pv.u.i = va_arg(ap, long); \ + (pv).u.i = va_arg(ap, long); \ break; \ case 'L': \ - pv.u.u = va_arg(ap, unsigned long); \ + (pv).u.u = va_arg(ap, unsigned long); \ break; \ case 'q': \ - pv.u.i = va_arg(ap, int64_t); \ + (pv).u.i = va_arg(ap, int64_t); \ break; \ case 'Q': \ case 'r': \ case 'R': \ - pv.u.u = va_arg(ap, 
uint64_t); \ + (pv).u.u = va_arg(ap, uint64_t); \ break; \ /* User format strings have already been validated. */ \ WT_ILLEGAL_VALUE(session); \ @@ -551,47 +556,47 @@ __unpack_read(WT_SESSION_IMPL *session, #define WT_UNPACK_PUT(session, pv, ap) do { \ WT_ITEM *__item; \ - switch (pv.type) { \ + switch ((pv).type) { \ case 'x': \ break; \ case 's': \ case 'S': \ - *va_arg(ap, const char **) = pv.u.s; \ + *va_arg(ap, const char **) = (pv).u.s; \ break; \ case 'U': \ case 'u': \ __item = va_arg(ap, WT_ITEM *); \ - __item->data = pv.u.item.data; \ - __item->size = pv.u.item.size; \ + __item->data = (pv).u.item.data; \ + __item->size = (pv).u.item.size; \ break; \ case 'b': \ - *va_arg(ap, int8_t *) = (int8_t)pv.u.i; \ + *va_arg(ap, int8_t *) = (int8_t)(pv).u.i; \ break; \ case 'h': \ - *va_arg(ap, int16_t *) = (short)pv.u.i; \ + *va_arg(ap, int16_t *) = (short)(pv).u.i; \ break; \ case 'i': \ case 'l': \ - *va_arg(ap, int32_t *) = (int32_t)pv.u.i; \ + *va_arg(ap, int32_t *) = (int32_t)(pv).u.i; \ break; \ case 'q': \ - *va_arg(ap, int64_t *) = pv.u.i; \ + *va_arg(ap, int64_t *) = (pv).u.i; \ break; \ case 'B': \ case 't': \ - *va_arg(ap, uint8_t *) = (uint8_t)pv.u.u; \ + *va_arg(ap, uint8_t *) = (uint8_t)(pv).u.u; \ break; \ case 'H': \ - *va_arg(ap, uint16_t *) = (uint16_t)pv.u.u; \ + *va_arg(ap, uint16_t *) = (uint16_t)(pv).u.u; \ break; \ case 'I': \ case 'L': \ - *va_arg(ap, uint32_t *) = (uint32_t)pv.u.u; \ + *va_arg(ap, uint32_t *) = (uint32_t)(pv).u.u; \ break; \ case 'Q': \ case 'r': \ case 'R': \ - *va_arg(ap, uint64_t *) = pv.u.u; \ + *va_arg(ap, uint64_t *) = (pv).u.u; \ break; \ /* User format strings have already been validated. 
*/ \ WT_ILLEGAL_VALUE(session); \ diff --git a/src/include/schema.h b/src/include/schema.h index a17affb7660..50e141d9921 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -78,6 +78,14 @@ struct __wt_table { */ #define WT_COLGROUPS(t) WT_MAX((t)->ncolgroups, 1) +/* Helpers for the locked state of the handle list and table locks. */ +#define WT_SESSION_LOCKED_HANDLE_LIST \ + (WT_SESSION_LOCKED_HANDLE_LIST_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST_WRITE) +#define WT_SESSION_LOCKED_TABLE \ + (WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_TABLE_WRITE) + /* * WT_WITH_LOCK_WAIT -- * Wait for a lock, perform an operation, drop the lock. @@ -85,7 +93,7 @@ struct __wt_table { #define WT_WITH_LOCK_WAIT(session, lock, flag, op) do { \ if (F_ISSET(session, (flag))) { \ op; \ - } else { \ + } else { \ __wt_spin_lock_track(session, lock); \ F_SET(session, (flag)); \ op; \ @@ -99,10 +107,11 @@ struct __wt_table { * Acquire a lock if available, perform an operation, drop the lock. */ #define WT_WITH_LOCK_NOWAIT(session, ret, lock, flag, op) do { \ - ret = 0; \ + (ret) = 0; \ if (F_ISSET(session, (flag))) { \ op; \ - } else if ((ret = __wt_spin_trylock(session, lock)) == 0) { \ + } else if (((ret) = \ + __wt_spin_trylock_track(session, lock)) == 0) { \ F_SET(session, (flag)); \ op; \ F_CLR(session, (flag)); \ @@ -122,16 +131,46 @@ struct __wt_table { &S2C(session)->checkpoint_lock, WT_SESSION_LOCKED_CHECKPOINT, op) /* - * WT_WITH_HANDLE_LIST_LOCK -- - * Acquire the data handle list lock, perform an operation, drop the lock. + * WT_WITH_HANDLE_LIST_READ_LOCK -- + * Acquire the data handle list lock in shared mode, perform an operation, + * drop the lock. The handle list lock is a read-write lock so the + * implementation is different to the other lock macros. * * Note: always waits because some operations need the handle list lock to * discard handles, and we only expect it to be held across short * operations. 
*/ -#define WT_WITH_HANDLE_LIST_LOCK(session, op) \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) +#define WT_WITH_HANDLE_LIST_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { \ + op; \ + } else { \ + __wt_readlock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) + +/* + * WT_WITH_HANDLE_LIST_WRITE_LOCK -- + * Acquire the data handle list lock in exclusive mode, perform an + * operation, drop the lock. The handle list lock is a read-write lock so + * the implementation is different to the other lock macros. + */ +#define WT_WITH_HANDLE_LIST_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ));\ + __wt_writelock(session, &S2C(session)->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &S2C(session)->dhandle_lock); \ + } \ +} while (0) /* * WT_WITH_METADATA_LOCK -- @@ -165,22 +204,58 @@ struct __wt_table { } while (0) /* - * WT_WITH_TABLE_LOCK, WT_WITH_TABLE_LOCK_NOWAIT -- + * WT_WITH_TABLE_READ_LOCK, WT_WITH_TABLE_WRITE_LOCK, + * WT_WITH_TABLE_WRITE_LOCK_NOWAIT -- * Acquire the table lock, perform an operation, drop the lock. + * The table lock is a read-write lock so the implementation is different + * to most other lock macros. + * + * Note: readlock always waits because some operations need the table lock + * to discard handles, and we only expect it to be held across short + * operations. 
*/ -#define WT_WITH_TABLE_LOCK(session, op) do { \ - WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_WAIT(session, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ +#define WT_WITH_TABLE_READ_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_readlock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &S2C(session)->table_lock); \ + } \ +} while (0) + +#define WT_WITH_TABLE_WRITE_LOCK(session, op) do { \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else { \ + WT_ASSERT(session, \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + __wt_writelock(session, &S2C(session)->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &S2C(session)->table_lock); \ + } \ } while (0) -#define WT_WITH_TABLE_LOCK_NOWAIT(session, ret, op) do { \ +#define WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, op) do { \ WT_ASSERT(session, \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE) || \ - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); \ - WT_WITH_LOCK_NOWAIT(session, ret, \ - &S2C(session)->table_lock, WT_SESSION_LOCKED_TABLE, op); \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE) || \ + !F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ | \ + WT_SESSION_LOCKED_HANDLE_LIST)); \ + if (F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)) { \ + op; \ + } else if (((ret) = __wt_try_writelock(session, \ + &S2C(session)->table_lock)) == 0) { \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + op; \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, 
&S2C(session)->table_lock); \ + } \ } while (0) /* @@ -192,19 +267,31 @@ struct __wt_table { WT_CONNECTION_IMPL *__conn = S2C(session); \ bool __checkpoint_locked = \ F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ - bool __handle_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ - bool __table_locked = \ - F_ISSET(session, WT_SESSION_LOCKED_TABLE); \ + bool __handle_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + bool __handle_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + bool __table_read_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_READ); \ + bool __table_write_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ bool __schema_locked = \ F_ISSET(session, WT_SESSION_LOCKED_SCHEMA); \ - if (__handle_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); \ - __wt_spin_unlock(session, &__conn->dhandle_lock); \ + if (__handle_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ + __wt_readunlock(session, &__conn->dhandle_lock); \ } \ - if (__table_locked) { \ - F_CLR(session, WT_SESSION_LOCKED_TABLE); \ - __wt_spin_unlock(session, &__conn->table_lock); \ + if (__handle_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ + __wt_writeunlock(session, &__conn->dhandle_lock); \ + } \ + if (__table_read_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_READ); \ + __wt_readunlock(session, &__conn->table_lock); \ + } \ + if (__table_write_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + __wt_writeunlock(session, &__conn->table_lock); \ } \ if (__schema_locked) { \ F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ @@ -223,12 +310,20 @@ struct __wt_table { __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ } \ - if (__table_locked) { \ - __wt_spin_lock(session, &__conn->table_lock); \ - F_SET(session, WT_SESSION_LOCKED_TABLE); \ + if (__table_read_locked) { \ + 
__wt_readlock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_READ); \ + } \ + if (__table_write_locked) { \ + __wt_writelock(session, &__conn->table_lock); \ + F_SET(session, WT_SESSION_LOCKED_TABLE_WRITE); \ + } \ + if (__handle_read_locked) { \ + __wt_readlock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); \ } \ - if (__handle_locked) { \ - __wt_spin_lock(session, &__conn->dhandle_lock); \ - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ + if (__handle_write_locked) { \ + __wt_writelock(session, &__conn->dhandle_lock); \ + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE); \ } \ } while (0) diff --git a/src/include/session.h b/src/include/session.h index 7dd523aea26..674e92671b1 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -52,8 +52,6 @@ struct __wt_session_impl { const char *lastop; /* Last operation */ uint32_t id; /* UID, offset in session array */ - WT_CONDVAR *cond; /* Condition variable */ - WT_EVENT_HANDLER *event_handler;/* Application's event handlers */ WT_DATA_HANDLE *dhandle; /* Current data handle */ @@ -69,7 +67,6 @@ struct __wt_session_impl { TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles; time_t last_sweep; /* Last sweep for dead handles */ - WT_CURSOR *cursor; /* Current cursor */ /* Cursors closed with the session */ TAILQ_HEAD(__cursors, __wt_cursor) cursors; @@ -90,7 +87,7 @@ struct __wt_session_impl { void *meta_track_sub; /* Child transaction / save point */ size_t meta_track_alloc; /* Currently allocated */ int meta_track_nest; /* Nesting level of meta transaction */ -#define WT_META_TRACKING(session) (session->meta_track_next != NULL) +#define WT_META_TRACKING(session) ((session)->meta_track_next != NULL) /* * Each session keeps a cache of table handles. 
The set of handles @@ -153,20 +150,16 @@ struct __wt_session_impl { uint32_t flags; /* - * The split stash memory and hazard information persist past session - * close because they are accessed by threads of control other than the - * thread owning the session. - * + * All of the following fields live at the end of the structure so it's + * easier to clear everything but the fields that persist. + */ +#define WT_SESSION_CLEAR_SIZE (offsetof(WT_SESSION_IMPL, rnd)) + + /* * The random number state persists past session close because we don't - * want to repeatedly allocate repeated values for skiplist depth if the + * want to repeatedly use the same values for skiplist depth when the * application isn't caching sessions. - * - * All of these fields live at the end of the structure so it's easier - * to clear everything but the fields that persist. */ -#define WT_SESSION_CLEAR_SIZE(s) \ - (WT_PTRDIFF(&(s)->rnd, s)) - WT_RAND_STATE rnd; /* Random number generation state */ /* Hashed handle reference list array */ @@ -175,6 +168,9 @@ struct __wt_session_impl { TAILQ_HEAD(__tables_hash, __wt_table) *tablehash; /* + * Split stash memory persists past session close because it's accessed + * by threads of control other than the thread owning the session. + * * Splits can "free" memory that may still be in use, and we use a * split generation number to track it, that is, the session stores a * reference to the memory and allocates a split generation; when no @@ -194,6 +190,9 @@ struct __wt_session_impl { /* * Hazard pointers. * + * Hazard information persists past session close because it's accessed + * by threads of control other than the thread owning the session. + * * Use the non-NULL state of the hazard field to know if the session has * previously been initialized. 
*/ diff --git a/src/include/stat.h b/src/include/stat.h index 0daab83e166..6c274484bcb 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -72,7 +72,7 @@ * and the session ID is a small, monotonically increasing number. */ #define WT_STATS_SLOT_ID(session) \ - ((session)->id) % WT_COUNTER_SLOTS + (((session)->id) % WT_COUNTER_SLOTS) /* * Statistic structures are arrays of int64_t's. We have functions to read/write @@ -310,10 +310,15 @@ struct __wt_connection_stats { int64_t cache_eviction_slow; int64_t cache_eviction_state; int64_t cache_eviction_walks_abandoned; + int64_t cache_eviction_active_workers; + int64_t cache_eviction_worker_created; int64_t cache_eviction_worker_evicting; + int64_t cache_eviction_worker_removed; + int64_t cache_eviction_stable_state_workers; int64_t cache_eviction_force_fail; int64_t cache_eviction_walks_active; int64_t cache_eviction_walks_started; + int64_t cache_eviction_force_retune; int64_t cache_eviction_hazard; int64_t cache_hazard_checks; int64_t cache_hazard_walks; @@ -388,9 +393,7 @@ struct __wt_connection_stats { int64_t lock_checkpoint_count; int64_t lock_checkpoint_wait_application; int64_t lock_checkpoint_wait_internal; - int64_t lock_handle_list_count; - int64_t lock_handle_list_wait_application; - int64_t lock_handle_list_wait_internal; + int64_t lock_handle_list_wait_eviction; int64_t lock_metadata_count; int64_t lock_metadata_wait_application; int64_t lock_metadata_wait_internal; @@ -402,9 +405,11 @@ struct __wt_connection_stats { int64_t lock_table_wait_internal; int64_t log_slot_switch_busy; int64_t log_slot_closes; + int64_t log_slot_active_closed; int64_t log_slot_races; int64_t log_slot_transitions; int64_t log_slot_joins; + int64_t log_slot_no_free_slots; int64_t log_slot_unbuffered; int64_t log_bytes_payload; int64_t log_bytes_written; @@ -564,6 +569,7 @@ struct __wt_dsrc_stats { int64_t cache_pages_requested; int64_t cache_write; int64_t cache_write_restore; + int64_t cache_bytes_dirty; int64_t 
cache_eviction_clean; int64_t cache_state_gen_avg_gap; int64_t cache_state_avg_written_size; diff --git a/src/include/thread_group.h b/src/include/thread_group.h index 76758a090c4..77cff00dc8d 100644 --- a/src/include/thread_group.h +++ b/src/include/thread_group.h @@ -40,7 +40,7 @@ struct __wt_thread_group { const char *name; /* Name */ - WT_RWLOCK *lock; /* Protects group changes */ + WT_RWLOCK lock; /* Protects group changes */ /* * Condition signalled when wanting to wake up threads that are diff --git a/src/include/txn.h b/src/include/txn.h index 12fc2a0a5b7..7e802c188ab 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -92,7 +92,7 @@ struct __wt_txn_global { * Prevents the oldest ID moving forwards while threads are scanning * the global transaction state. */ - WT_RWLOCK *scan_rwlock; + WT_RWLOCK scan_rwlock; /* * Track information about the running checkpoint. The transaction @@ -114,7 +114,7 @@ struct __wt_txn_global { volatile uint64_t metadata_pinned; /* Oldest ID for metadata */ /* Named snapshot state. */ - WT_RWLOCK *nsnap_rwlock; + WT_RWLOCK nsnap_rwlock; volatile uint64_t nsnap_oldest_id; TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; diff --git a/src/include/txn.i b/src/include/txn.i index 0cc4a6f8439..314c948e4d1 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -125,7 +125,8 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) * minimum of it with the oldest ID, which is what we want. 
*/ oldest_id = txn_global->oldest_id; - include_checkpoint_txn = btree == NULL || btree->include_checkpoint_txn; + include_checkpoint_txn = btree == NULL || + btree->checkpoint_gen != txn_global->checkpoint_gen; WT_READ_BARRIER(); checkpoint_pinned = txn_global->checkpoint_pinned; diff --git a/src/include/verify_build.h b/src/include/verify_build.h index 8abc192892e..640f5e4cf5f 100644 --- a/src/include/verify_build.h +++ b/src/include/verify_build.h @@ -59,7 +59,6 @@ __wt_verify_build(void) sizeof(s) > WT_CACHE_LINE_ALIGNMENT || \ sizeof(s) % WT_CACHE_LINE_ALIGNMENT == 0) WT_PADDING_CHECK(WT_LOGSLOT); - WT_PADDING_CHECK(WT_RWLOCK); WT_PADDING_CHECK(WT_SPINLOCK); WT_PADDING_CHECK(WT_TXN_STATE); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index a6deed7e14e..ddecb2ac765 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -36,7 +36,7 @@ extern "C" { #if defined(DOXYGEN) || defined(SWIG) #define __F(func) func #else -#define __F(func) (*func) +#define __F(func) (*(func)) #endif #ifdef SWIG @@ -114,16 +114,16 @@ struct __wt_item { size_t size; #ifndef DOXYGEN -#define WT_ITEM_ALIGNED 0x00000001 -#define WT_ITEM_INUSE 0x00000002 - /* This appears in the middle of the struct to avoid padding. */ - /*! Object flags (internal use). */ - uint32_t flags; - /*! Managed memory chunk (internal use). */ void *mem; + /*! Managed memory size (internal use). */ size_t memsize; + +#define WT_ITEM_ALIGNED 0x00000001 +#define WT_ITEM_INUSE 0x00000002 + /*! Object flags (internal use). */ + uint32_t flags; #endif }; @@ -427,7 +427,7 @@ struct __wt_cursor { * * @param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and a record with + * In particular, if \c overwrite=false is configured and a record with * the specified key already exists, ::WT_DUPLICATE_KEY is returned. 
* Also, if \c in_memory is configured for the database and the insert * requires more than the configured cache size to complete, @@ -452,7 +452,9 @@ struct __wt_cursor { * * On success, the cursor ends positioned at the modified record; to * minimize cursor resources, the WT_CURSOR::reset method should be - * called as soon as the cursor no longer needs that position. + * called as soon as the cursor no longer needs that position. (The + * WT_CURSOR::insert method never keeps a cursor position and may be + * more efficient for that reason.) * * The maximum length of a single column stored in a table is not fixed * (as it partially depends on the underlying file configuration), but @@ -460,7 +462,7 @@ struct __wt_cursor { * * @param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and no record with + * In particular, if \c overwrite=false is configured and no record with * the specified key exists, ::WT_NOTFOUND is returned. * Also, if \c in_memory is configured for the database and the insert * requires more than the configured cache size to complete, @@ -477,8 +479,18 @@ struct __wt_cursor { * * @snippet ex_all.c Remove a record * - * If the cursor was not configured with "overwrite=true", the key must - * be set and the key's record must exist; the record will be removed. + * If the cursor was configured with "overwrite=false" (not the + * default), the key must be set and the key's record must exist; the + * record will be removed. + * + * Any cursor position does not change: if the cursor was positioned + * before the WT_CURSOR::remove call, the cursor remains positioned + * at the removed record; to minimize cursor resources, the + * WT_CURSOR::reset method should be called as soon as the cursor no + * longer needs that position. 
If the cursor was not positioned before + * the WT_CURSOR::remove call, the cursor ends with no position, and a + * subsequent call to the WT_CURSOR::next (WT_CURSOR::prev) method will + * iterate from the beginning (end) of the table. * * @snippet ex_all.c Remove a record and fail if DNE * @@ -486,14 +498,10 @@ struct __wt_cursor { * (that is, a store with an 'r' type key and 't' type value) is * identical to setting the record's value to 0. * - * On success, the cursor ends positioned at the removed record; to - * minimize cursor resources, the WT_CURSOR::reset method should be - * called as soon as the cursor no longer needs that position. - * * @param cursor the cursor handle * @errors - * In particular, if \c overwrite is not configured and no record with - * the specified key exists, ::WT_NOTFOUND is returned. + * In particular, if \c overwrite=false is configured and no record + * with the specified key exists, ::WT_NOTFOUND is returned. */ int __F(remove)(WT_CURSOR *cursor); /*! @} */ @@ -576,8 +584,9 @@ struct __wt_cursor { #define WT_CURSTD_OPEN 0x00200 #define WT_CURSTD_OVERWRITE 0x00400 #define WT_CURSTD_RAW 0x00800 -#define WT_CURSTD_VALUE_EXT 0x01000 /* Value points out of the tree. */ -#define WT_CURSTD_VALUE_INT 0x02000 /* Value points into the tree. */ +#define WT_CURSTD_RAW_SEARCH 0x01000 +#define WT_CURSTD_VALUE_EXT 0x02000 /* Value points out of the tree. */ +#define WT_CURSTD_VALUE_INT 0x04000 /* Value points into the tree. 
*/ #define WT_CURSTD_VALUE_SET (WT_CURSTD_VALUE_EXT | WT_CURSTD_VALUE_INT) uint32_t flags; #endif @@ -1233,8 +1242,8 @@ struct __wt_session { * @config{split_pct, the Btree page split size as a percentage of the * maximum Btree page size\, that is\, when a Btree page is split\, it * will be split into smaller pages\, where each page is the specified - * percentage of the maximum Btree page size., an integer between 25 and - * 100; default \c 75.} + * percentage of the maximum Btree page size., an integer between 50 and + * 100; default \c 90.} * @config{type, set the type of data source used to store a column * group\, index or simple table. By default\, a \c "file:" URI is * derived from the object name. The \c type configuration can be used @@ -1471,6 +1480,10 @@ struct __wt_session { * contains. * @snippet ex_all.c Truncate a range * + * Any specified cursors end with no position, and subsequent calls to + * the WT_CURSOR::next (WT_CURSOR::prev) method will iterate from the + * beginning (end) of the table. + * * @param session the session handle * @param name the URI of the file or table to truncate * @param start optional cursor marking the first record discarded; @@ -1855,7 +1868,7 @@ struct __wt_connection { * threads WiredTiger will start to help evict pages from cache. The * number of threads started will vary depending on the current eviction * load. Each eviction worker thread uses a session from the configured - * session_max., an integer between 1 and 20; default \c 1.} + * session_max., an integer between 1 and 20; default \c 8.} * @config{ threads_min, minimum number of * threads WiredTiger will start to help evict pages from cache. 
The * number of threads currently running will vary depending on the @@ -1982,12 +1995,13 @@ struct __wt_connection { * as a list\, such as <code>"verbose=[evictserver\,read]"</code>., a * list\, with values chosen from the following options: \c "api"\, \c * "block"\, \c "checkpoint"\, \c "compact"\, \c "evict"\, \c - * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, - * \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c - * "read"\, \c "rebalance"\, \c "reconcile"\, \c "recovery"\, \c - * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c - * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "evict_stuck"\, \c "evictserver"\, \c "fileops"\, \c "handleops"\, \c + * "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c + * "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c + * "recovery"\, \c "recovery_progress"\, \c "salvage"\, \c + * "shared_cache"\, \c "split"\, \c "temporary"\, \c "thread_group"\, \c + * "transaction"\, \c "verify"\, \c "version"\, \c "write"; default + * empty.} * @configend * @errors */ @@ -2331,7 +2345,7 @@ struct __wt_connection { * WiredTiger will start to help evict pages from cache. The number of threads * started will vary depending on the current eviction load. Each eviction * worker thread uses a session from the configured session_max., an integer - * between 1 and 20; default \c 1.} + * between 1 and 20; default \c 8.} * @config{ threads_min, * minimum number of threads WiredTiger will start to help evict pages from * cache. The number of threads currently running will vary depending on the @@ -2361,7 +2375,7 @@ struct __wt_connection { * @config{exclusive, fail if the database already exists\, generally used with * the \c create option., a boolean flag; default \c false.} * @config{extensions, list of shared library extensions to load (using dlopen). 
- * Any values specified to an library extension are passed to + * Any values specified to a library extension are passed to * WT_CONNECTION::load_extension as the \c config parameter (for example\, * <code>extensions=(/path/ext.so={entry=my_entry})</code>)., a list of strings; * default empty.} @@ -2513,12 +2527,13 @@ struct __wt_connection { * WiredTiger is configured with --enable-verbose. Options are given as a * list\, such as <code>"verbose=[evictserver\,read]"</code>., a list\, with * values chosen from the following options: \c "api"\, \c "block"\, \c - * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evictserver"\, \c "fileops"\, - * \c "handleops"\, \c "log"\, \c "lsm"\, \c "lsm_manager"\, \c "metadata"\, \c - * "mutex"\, \c "overflow"\, \c "read"\, \c "rebalance"\, \c "reconcile"\, \c - * "recovery"\, \c "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, - * \c "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c - * "write"; default empty.} + * "checkpoint"\, \c "compact"\, \c "evict"\, \c "evict_stuck"\, \c + * "evictserver"\, \c "fileops"\, \c "handleops"\, \c "log"\, \c "lsm"\, \c + * "lsm_manager"\, \c "metadata"\, \c "mutex"\, \c "overflow"\, \c "read"\, \c + * "rebalance"\, \c "reconcile"\, \c "recovery"\, \c "recovery_progress"\, \c + * "salvage"\, \c "shared_cache"\, \c "split"\, \c "temporary"\, \c + * "thread_group"\, \c "transaction"\, \c "verify"\, \c "version"\, \c "write"; + * default empty.} * @config{write_through, Use \c FILE_FLAG_WRITE_THROUGH on Windows to write to * files. Ignored on non-Windows systems. Options are given as a list\, such * as <code>"write_through=[data]"</code>. Configuring \c write_through requires @@ -3062,27 +3077,27 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * transaction is in progress, it should be rolled back and the operation * retried in a new transaction. */ -#define WT_ROLLBACK -31800 +#define WT_ROLLBACK (-31800) /*! 
* Attempt to insert an existing key. * This error is generated when the application attempts to insert a record with * the same key as an existing record without the 'overwrite' configuration to * WT_SESSION::open_cursor. */ -#define WT_DUPLICATE_KEY -31801 +#define WT_DUPLICATE_KEY (-31801) /*! * Non-specific WiredTiger error. * This error is returned when an error is not covered by a specific error * return. */ -#define WT_ERROR -31802 +#define WT_ERROR (-31802) /*! * Item not found. * This error indicates an operation did not find a value to return. This * includes cursor search and other operations where no record matched the * cursor's search key such as WT_CURSOR::update or WT_CURSOR::remove. */ -#define WT_NOTFOUND -31803 +#define WT_NOTFOUND (-31803) /*! * WiredTiger library panic. * This error indicates an underlying problem that requires the application exit @@ -3090,17 +3105,17 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * returned from a WiredTiger interface, no further WiredTiger calls are * required. */ -#define WT_PANIC -31804 +#define WT_PANIC (-31804) /*! @cond internal */ /*! Restart the operation (internal). */ -#define WT_RESTART -31805 +#define WT_RESTART (-31805) /*! @endcond */ /*! * Recovery must be run to continue. * This error is generated when wiredtiger_open is configured to return an error * if recovery is required to use the database. */ -#define WT_RUN_RECOVERY -31806 +#define WT_RUN_RECOVERY (-31806) /*! * Operation would overflow cache. * This error is only generated when wiredtiger_open is configured to run in- @@ -3109,7 +3124,7 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); * progress, it should be rolled back and the operation retried in a new * transaction. */ -#define WT_CACHE_FULL -31807 +#define WT_CACHE_FULL (-31807) /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. 
@@ -4429,396 +4444,406 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_STATE 1051 /*! cache: eviction walks abandoned */ #define WT_STAT_CONN_CACHE_EVICTION_WALKS_ABANDONED 1052 +/*! cache: eviction worker thread active */ +#define WT_STAT_CONN_CACHE_EVICTION_ACTIVE_WORKERS 1053 +/*! cache: eviction worker thread created */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_CREATED 1054 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1053 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1055 +/*! cache: eviction worker thread removed */ +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_REMOVED 1056 +/*! cache: eviction worker thread stable number */ +#define WT_STAT_CONN_CACHE_EVICTION_STABLE_STATE_WORKERS 1057 /*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1054 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1058 /*! cache: files with active eviction walks */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1055 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_ACTIVE 1059 /*! cache: files with new eviction walks started */ -#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1056 +#define WT_STAT_CONN_CACHE_EVICTION_WALKS_STARTED 1060 +/*! cache: force re-tuning of eviction workers once in a while */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_RETUNE 1061 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1057 +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1062 /*! cache: hazard pointer check calls */ -#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1058 +#define WT_STAT_CONN_CACHE_HAZARD_CHECKS 1063 /*! cache: hazard pointer check entries walked */ -#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1059 +#define WT_STAT_CONN_CACHE_HAZARD_WALKS 1064 /*! 
cache: hazard pointer maximum array length */ -#define WT_STAT_CONN_CACHE_HAZARD_MAX 1060 +#define WT_STAT_CONN_CACHE_HAZARD_MAX 1065 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1061 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1066 /*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1062 +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1067 /*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1063 +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1068 /*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1064 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1069 /*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1065 +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1070 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1066 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1071 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1067 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1072 /*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1068 +#define WT_STAT_CONN_CACHE_BYTES_MAX 1073 /*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1069 +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1074 /*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1070 +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1075 /*! cache: modified pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1071 +#define WT_STAT_CONN_CACHE_EVICTION_APP_DIRTY 1076 /*! cache: overflow pages read into cache */ -#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1072 +#define WT_STAT_CONN_CACHE_READ_OVERFLOW 1077 /*! 
cache: overflow values cached in memory */ -#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1073 +#define WT_STAT_CONN_CACHE_OVERFLOW_VALUE 1078 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1074 +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1079 /*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1075 +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1080 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1076 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1081 /*! cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1077 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1082 /*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1078 +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1083 /*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1079 +#define WT_STAT_CONN_CACHE_EVICTION_APP 1084 /*! cache: pages queued for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1080 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED 1085 /*! cache: pages queued for urgent eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1081 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_URGENT 1086 /*! cache: pages queued for urgent eviction during walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1082 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_QUEUED_OLDEST 1087 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1083 +#define WT_STAT_CONN_CACHE_READ 1088 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1084 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1089 /*! cache: pages requested from the cache */ -#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1085 +#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1090 /*! 
cache: pages seen by eviction walk */ -#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1086 +#define WT_STAT_CONN_CACHE_EVICTION_PAGES_SEEN 1091 /*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1087 +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1092 /*! cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1088 +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1093 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1089 +#define WT_STAT_CONN_CACHE_WRITE 1094 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1090 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1095 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1091 +#define WT_STAT_CONN_CACHE_OVERHEAD 1096 /*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1092 +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1097 /*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1093 +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1098 /*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1094 +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1099 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1095 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1100 /*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1096 +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1101 /*! connection: auto adjusting condition resets */ -#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1097 +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1102 /*! connection: auto adjusting condition wait calls */ -#define WT_STAT_CONN_COND_AUTO_WAIT 1098 +#define WT_STAT_CONN_COND_AUTO_WAIT 1103 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1099 +#define WT_STAT_CONN_FILE_OPEN 1104 /*! 
connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1100 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1105 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1101 +#define WT_STAT_CONN_MEMORY_FREE 1106 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1102 +#define WT_STAT_CONN_MEMORY_GROW 1107 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1103 +#define WT_STAT_CONN_COND_WAIT 1108 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1104 +#define WT_STAT_CONN_RWLOCK_READ 1109 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1105 +#define WT_STAT_CONN_RWLOCK_WRITE 1110 /*! connection: total fsync I/Os */ -#define WT_STAT_CONN_FSYNC_IO 1106 +#define WT_STAT_CONN_FSYNC_IO 1111 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1107 +#define WT_STAT_CONN_READ_IO 1112 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1108 +#define WT_STAT_CONN_WRITE_IO 1113 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1109 +#define WT_STAT_CONN_CURSOR_CREATE 1114 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1110 +#define WT_STAT_CONN_CURSOR_INSERT 1115 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1111 +#define WT_STAT_CONN_CURSOR_NEXT 1116 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1112 +#define WT_STAT_CONN_CURSOR_PREV 1117 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1113 +#define WT_STAT_CONN_CURSOR_REMOVE 1118 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1114 +#define WT_STAT_CONN_CURSOR_RESET 1119 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1115 +#define WT_STAT_CONN_CURSOR_RESTART 1120 /*! 
cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1116 +#define WT_STAT_CONN_CURSOR_SEARCH 1121 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1117 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1122 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1118 +#define WT_STAT_CONN_CURSOR_UPDATE 1123 /*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1119 +#define WT_STAT_CONN_CURSOR_TRUNCATE 1124 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1120 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1125 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1121 +#define WT_STAT_CONN_DH_SWEEP_REF 1126 /*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1122 +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1127 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1123 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1128 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1124 +#define WT_STAT_CONN_DH_SWEEP_TOD 1129 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1125 +#define WT_STAT_CONN_DH_SWEEPS 1130 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1126 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1131 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1127 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1132 /*! lock: checkpoint lock acquisitions */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1128 +#define WT_STAT_CONN_LOCK_CHECKPOINT_COUNT 1133 /*! lock: checkpoint lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1129 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_APPLICATION 1134 /*! 
lock: checkpoint lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1130 -/*! lock: handle-list lock acquisitions */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_COUNT 1131 -/*! lock: handle-list lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_APPLICATION 1132 -/*! lock: handle-list lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_INTERNAL 1133 +#define WT_STAT_CONN_LOCK_CHECKPOINT_WAIT_INTERNAL 1135 +/*! lock: handle-list lock eviction thread wait time (usecs) */ +#define WT_STAT_CONN_LOCK_HANDLE_LIST_WAIT_EVICTION 1136 /*! lock: metadata lock acquisitions */ -#define WT_STAT_CONN_LOCK_METADATA_COUNT 1134 +#define WT_STAT_CONN_LOCK_METADATA_COUNT 1137 /*! lock: metadata lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1135 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_APPLICATION 1138 /*! lock: metadata lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1136 +#define WT_STAT_CONN_LOCK_METADATA_WAIT_INTERNAL 1139 /*! lock: schema lock acquisitions */ -#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1137 +#define WT_STAT_CONN_LOCK_SCHEMA_COUNT 1140 /*! lock: schema lock application thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1138 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_APPLICATION 1141 /*! lock: schema lock internal thread wait time (usecs) */ -#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1139 +#define WT_STAT_CONN_LOCK_SCHEMA_WAIT_INTERNAL 1142 /*! lock: table lock acquisitions */ -#define WT_STAT_CONN_LOCK_TABLE_COUNT 1140 +#define WT_STAT_CONN_LOCK_TABLE_COUNT 1143 /*! * lock: table lock application thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1141 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_APPLICATION 1144 /*! 
* lock: table lock internal thread time waiting for the table lock * (usecs) */ -#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1142 +#define WT_STAT_CONN_LOCK_TABLE_WAIT_INTERNAL 1145 /*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1143 +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1146 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1144 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1147 +/*! log: consolidated slot join active slot closed */ +#define WT_STAT_CONN_LOG_SLOT_ACTIVE_CLOSED 1148 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1145 +#define WT_STAT_CONN_LOG_SLOT_RACES 1149 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1146 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1150 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1147 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1151 +/*! log: consolidated slot transitions unable to find free slot */ +#define WT_STAT_CONN_LOG_SLOT_NO_FREE_SLOTS 1152 /*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1148 +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1153 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1149 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1154 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1150 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1155 /*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1151 +#define WT_STAT_CONN_LOG_ZERO_FILLS 1156 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1152 +#define WT_STAT_CONN_LOG_FLUSH 1157 /*! log: log force write operations */ -#define WT_STAT_CONN_LOG_FORCE_WRITE 1153 +#define WT_STAT_CONN_LOG_FORCE_WRITE 1158 /*! log: log force write operations skipped */ -#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1154 +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1159 /*! 
log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1155 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1160 /*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1156 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1161 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1157 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1162 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1158 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1163 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1159 +#define WT_STAT_CONN_LOG_SCANS 1164 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1160 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1165 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1161 +#define WT_STAT_CONN_LOG_WRITE_LSN 1166 /*! log: log server thread write LSN walk skipped */ -#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1162 +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1167 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1163 +#define WT_STAT_CONN_LOG_SYNC 1168 /*! log: log sync time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DURATION 1164 +#define WT_STAT_CONN_LOG_SYNC_DURATION 1169 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1165 +#define WT_STAT_CONN_LOG_SYNC_DIR 1170 /*! log: log sync_dir time duration (usecs) */ -#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1166 +#define WT_STAT_CONN_LOG_SYNC_DIR_DURATION 1171 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1167 +#define WT_STAT_CONN_LOG_WRITES 1172 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1168 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1173 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1169 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1174 /*! 
log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1170 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1175 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1171 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1176 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1172 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1177 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1173 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1178 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1174 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1179 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1175 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1180 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1176 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1181 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1177 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1182 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1178 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1183 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1179 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1184 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1180 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1185 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1181 +#define WT_STAT_CONN_REC_PAGES 1186 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1182 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1187 /*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1183 +#define WT_STAT_CONN_REC_PAGE_DELETE 1188 /*! 
reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1184 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1189 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1185 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1190 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1186 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1191 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1187 +#define WT_STAT_CONN_SESSION_OPEN 1192 /*! session: table alter failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1188 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_FAIL 1193 /*! session: table alter successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1189 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SUCCESS 1194 /*! session: table alter unchanged and skipped */ -#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1190 +#define WT_STAT_CONN_SESSION_TABLE_ALTER_SKIP 1195 /*! session: table compact failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1191 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_FAIL 1196 /*! session: table compact successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1192 +#define WT_STAT_CONN_SESSION_TABLE_COMPACT_SUCCESS 1197 /*! session: table create failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1193 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_FAIL 1198 /*! session: table create successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1194 +#define WT_STAT_CONN_SESSION_TABLE_CREATE_SUCCESS 1199 /*! session: table drop failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1195 +#define WT_STAT_CONN_SESSION_TABLE_DROP_FAIL 1200 /*! session: table drop successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1196 +#define WT_STAT_CONN_SESSION_TABLE_DROP_SUCCESS 1201 /*! 
session: table rebalance failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1197 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_FAIL 1202 /*! session: table rebalance successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1198 +#define WT_STAT_CONN_SESSION_TABLE_REBALANCE_SUCCESS 1203 /*! session: table rename failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1199 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_FAIL 1204 /*! session: table rename successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1200 +#define WT_STAT_CONN_SESSION_TABLE_RENAME_SUCCESS 1205 /*! session: table salvage failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1201 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_FAIL 1206 /*! session: table salvage successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1202 +#define WT_STAT_CONN_SESSION_TABLE_SALVAGE_SUCCESS 1207 /*! session: table truncate failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1203 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_FAIL 1208 /*! session: table truncate successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1204 +#define WT_STAT_CONN_SESSION_TABLE_TRUNCATE_SUCCESS 1209 /*! session: table verify failed calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1205 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_FAIL 1210 /*! session: table verify successful calls */ -#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1206 +#define WT_STAT_CONN_SESSION_TABLE_VERIFY_SUCCESS 1211 /*! thread-state: active filesystem fsync calls */ -#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1207 +#define WT_STAT_CONN_THREAD_FSYNC_ACTIVE 1212 /*! thread-state: active filesystem read calls */ -#define WT_STAT_CONN_THREAD_READ_ACTIVE 1208 +#define WT_STAT_CONN_THREAD_READ_ACTIVE 1213 /*! thread-state: active filesystem write calls */ -#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1209 +#define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1214 /*! 
thread-yield: application thread time evicting (usecs) */ -#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1210 +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1215 /*! thread-yield: application thread time waiting for cache (usecs) */ -#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1211 +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1216 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1212 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1217 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1213 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1218 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1214 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1219 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1215 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1220 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1216 +#define WT_STAT_CONN_PAGE_SLEEP 1221 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1217 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1222 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1218 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1223 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1219 +#define WT_STAT_CONN_TXN_BEGIN 1224 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1220 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1225 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1221 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1226 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1222 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1227 /*! 
transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1223 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1228 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1224 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1229 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1225 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1230 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1226 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1231 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1227 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1232 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1228 +#define WT_STAT_CONN_TXN_CHECKPOINT 1233 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1229 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1234 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1230 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1235 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1231 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1236 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1232 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1237 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1233 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1238 /*! 
transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1234 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1239 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1235 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1240 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1236 +#define WT_STAT_CONN_TXN_SYNC 1241 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1237 +#define WT_STAT_CONN_TXN_COMMIT 1242 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1238 +#define WT_STAT_CONN_TXN_ROLLBACK 1243 /*! * @} @@ -4978,181 +5003,183 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_CACHE_WRITE 2059 /*! cache: pages written requiring in-memory restoration */ #define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2060 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_DSRC_CACHE_BYTES_DIRTY 2061 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2061 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2062 /*! * cache_walk: Average difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2062 +#define WT_STAT_DSRC_CACHE_STATE_GEN_AVG_GAP 2063 /*! * cache_walk: Average on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2063 +#define WT_STAT_DSRC_CACHE_STATE_AVG_WRITTEN_SIZE 2064 /*! * cache_walk: Clean pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2064 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_CLEAN 2065 /*! 
* cache_walk: Current eviction generation, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2065 +#define WT_STAT_DSRC_CACHE_STATE_GEN_CURRENT 2066 /*! * cache_walk: Dirty pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2066 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_DIRTY 2067 /*! * cache_walk: Entries in the root page, only reported if cache_walk or * all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2067 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_ENTRIES 2068 /*! * cache_walk: Internal pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2068 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_INTERNAL 2069 /*! * cache_walk: Leaf pages currently in cache, only reported if cache_walk * or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2069 +#define WT_STAT_DSRC_CACHE_STATE_PAGES_LEAF 2070 /*! * cache_walk: Maximum difference between current eviction generation * when the page was last considered, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2070 +#define WT_STAT_DSRC_CACHE_STATE_GEN_MAX_GAP 2071 /*! * cache_walk: Maximum page size seen, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2071 +#define WT_STAT_DSRC_CACHE_STATE_MAX_PAGESIZE 2072 /*! * cache_walk: Minimum on-disk page image size seen, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2072 +#define WT_STAT_DSRC_CACHE_STATE_MIN_WRITTEN_SIZE 2073 /*! 
* cache_walk: On-disk page image sizes smaller than a single allocation * unit, only reported if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2073 +#define WT_STAT_DSRC_CACHE_STATE_SMALLER_ALLOC_SIZE 2074 /*! * cache_walk: Pages created in memory and never written, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2074 +#define WT_STAT_DSRC_CACHE_STATE_MEMORY 2075 /*! * cache_walk: Pages currently queued for eviction, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2075 +#define WT_STAT_DSRC_CACHE_STATE_QUEUED 2076 /*! * cache_walk: Pages that could not be queued for eviction, only reported * if cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2076 +#define WT_STAT_DSRC_CACHE_STATE_NOT_QUEUEABLE 2077 /*! * cache_walk: Refs skipped during cache traversal, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2077 +#define WT_STAT_DSRC_CACHE_STATE_REFS_SKIPPED 2078 /*! * cache_walk: Size of the root page, only reported if cache_walk or all * statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2078 +#define WT_STAT_DSRC_CACHE_STATE_ROOT_SIZE 2079 /*! * cache_walk: Total number of pages currently in cache, only reported if * cache_walk or all statistics are enabled */ -#define WT_STAT_DSRC_CACHE_STATE_PAGES 2079 +#define WT_STAT_DSRC_CACHE_STATE_PAGES 2080 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2080 +#define WT_STAT_DSRC_COMPRESS_READ 2081 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2081 +#define WT_STAT_DSRC_COMPRESS_WRITE 2082 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2082 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2083 /*! 
compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2083 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2084 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2084 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2085 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2085 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2086 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2086 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2087 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2087 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2088 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2088 +#define WT_STAT_DSRC_CURSOR_CREATE 2089 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2089 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2090 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2090 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2091 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2091 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2092 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2092 +#define WT_STAT_DSRC_CURSOR_INSERT 2093 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2093 +#define WT_STAT_DSRC_CURSOR_NEXT 2094 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2094 +#define WT_STAT_DSRC_CURSOR_PREV 2095 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2095 +#define WT_STAT_DSRC_CURSOR_REMOVE 2096 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2096 +#define WT_STAT_DSRC_CURSOR_RESET 2097 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2097 +#define WT_STAT_DSRC_CURSOR_RESTART 2098 /*! 
cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2098 +#define WT_STAT_DSRC_CURSOR_SEARCH 2099 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2099 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2100 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2100 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2101 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2101 +#define WT_STAT_DSRC_CURSOR_UPDATE 2102 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2102 +#define WT_STAT_DSRC_REC_DICTIONARY 2103 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2103 +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2104 /*! * reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2104 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2105 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2105 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2106 /*! reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2106 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2107 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2107 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2108 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2108 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2109 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2109 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2110 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2110 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2111 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2111 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2112 /*! 
reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2112 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2113 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2113 +#define WT_STAT_DSRC_REC_PAGES 2114 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2114 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2115 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2115 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2116 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2116 +#define WT_STAT_DSRC_SESSION_COMPACT 2117 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2117 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2118 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2118 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2119 /*! * @} diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index e18563dd2d2..da318ad8a86 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -106,6 +106,8 @@ struct __wt_col; typedef struct __wt_col WT_COL; struct __wt_col_rle; typedef struct __wt_col_rle WT_COL_RLE; +struct __wt_col_var_repeat; + typedef struct __wt_col_var_repeat WT_COL_VAR_REPEAT; struct __wt_colgroup; typedef struct __wt_colgroup WT_COLGROUP; struct __wt_compact_state; @@ -266,8 +268,6 @@ struct __wt_ref; typedef struct __wt_ref WT_REF; struct __wt_row; typedef struct __wt_row WT_ROW; -struct __wt_rwlock; - typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; struct __wt_save_upd; @@ -302,6 +302,8 @@ union __wt_lsn; typedef union __wt_lsn WT_LSN; union __wt_rand_state; typedef union __wt_rand_state WT_RAND_STATE; +union __wt_rwlock; + typedef union __wt_rwlock WT_RWLOCK; /* * Forward type declarations for internal types: END * DO NOT EDIT: automatically built by dist/s_typedef. 
diff --git a/src/log/log.c b/src/log/log.c index 413df312a15..803d3e8dfab 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -24,7 +24,7 @@ static int __log_write_internal( * __log_wait_for_earlier_slot -- * Wait for write_lsn to catch up to this slot. */ -static void +static int __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; @@ -41,16 +41,18 @@ __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * unlock in case an earlier thread is trying to switch its * slot and complete its operation. */ + WT_RET(WT_SESSION_CHECK_PANIC(session)); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + __wt_cond_signal(session, conn->log_wrlsn_cond); if (++yield_count < WT_THOUSAND) __wt_yield(); else - __wt_cond_wait(session, log->log_write_cond, 200); + __wt_cond_wait(session, log->log_write_cond, 200, NULL); if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } + return (0); } /* @@ -62,16 +64,21 @@ static int __log_fs_write(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { + WT_DECL_RET; + /* * If we're writing into a new log file, we have to wait for all * writes to the previous log file to complete otherwise there could * be a hole at the end of the previous log file that we cannot detect. 
*/ if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { - __log_wait_for_earlier_slot(session, slot); + WT_RET(__log_wait_for_earlier_slot(session, slot)); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } - return (__wt_write(session, slot->slot_fh, offset, len, buf)); + if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) + WT_PANIC_MSG(session, ret, + "%s: fatal log failure", slot->slot_fh->name); + return (ret); } /* @@ -89,7 +96,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); } /* @@ -105,6 +112,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; + WT_RET(WT_SESSION_CHECK_PANIC(session)); WT_RET(__wt_log_force_write(session, 1, NULL)); __wt_log_wrlsn(session, NULL); if (start) @@ -169,8 +177,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * log file ready to close. 
*/ while (log->sync_lsn.l.file < min_lsn->l.file) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); __wt_cond_signal(session, S2C(session)->log_file_cond); - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); @@ -300,14 +309,11 @@ void __wt_log_written_reset(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; - WT_LOG *log; conn = S2C(session); - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - return; - log = conn->log; - log->log_written = 0; - return; + + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + conn->log->log_written = 0; } /* @@ -777,8 +783,8 @@ __log_openfile(WT_SESSION_IMPL *session, __wt_log_desc_byteswap(desc); if (desc->log_magic != WT_LOG_MAGIC) WT_PANIC_RET(session, WT_ERROR, - "log file %s corrupted: Bad magic number %" PRIu32, - (*fhp)->name, desc->log_magic); + "log file %s corrupted: Bad magic number %" PRIu32, + (*fhp)->name, desc->log_magic); if (desc->majorv > WT_LOG_MAJOR_VERSION || (desc->majorv == WT_LOG_MAJOR_VERSION && desc->minorv > WT_LOG_MINOR_VERSION)) @@ -895,12 +901,12 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) */ create_log = true; if (conn->log_prealloc > 0 && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __log_alloc_prealloc(session, log->fileid); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); /* * If ret is 0 it means we found a pre-allocated file. 
@@ -915,7 +921,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) else { WT_STAT_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - __wt_cond_auto_signal( + __wt_cond_signal( session, conn->log_cond); } } @@ -1029,12 +1035,12 @@ __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) log = conn->log; if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); if (conn->hot_backup) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); else { ret = __wt_ftruncate(session, log_fh, offset); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); if (ret != ENOTSUP) return (ret); F_SET(log, WT_LOG_TRUNCATE_NOTSUP); @@ -1462,7 +1468,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * be holes in the log file. */ WT_STAT_CONN_INCR(session, log_release_write_lsn); - __log_wait_for_earlier_slot(session, slot); + WT_ERR(__log_wait_for_earlier_slot(session, slot)); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; @@ -1483,6 +1489,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * current fsync completes and advance log->sync_lsn. */ while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); /* * We have to wait until earlier log files have finished their * sync operations. 
The most recent one will set the LSN to the @@ -1490,7 +1497,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { - __wt_cond_wait(session, log->log_sync_cond, 10000); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); continue; } locked = true; @@ -1655,10 +1663,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); if (logcount == 0) - /* - * Return it is not supported if none don't exist. - */ - return (ENOTSUP); + WT_RET_MSG(session, ENOTSUP, "no log files found"); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); @@ -1674,6 +1679,10 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, &log_fh, WT_LOG_FILENAME, start_lsn.l.file, WT_LOG_OPEN_VERIFY)); WT_ERR(__wt_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); @@ -1722,6 +1731,11 @@ advance: WT_ERR(__log_openfile(session, &log_fh, WT_LOG_FILENAME, rd_lsn.l.file, WT_LOG_OPEN_VERIFY)); + if (LF_ISSET(WT_LOGSCAN_RECOVER)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 + " through %" PRIu32, + rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; @@ -1758,9 +1772,8 @@ advance: if (eol) /* Found a hole. This LSN is the end. */ break; - else - /* Last record in log. Look for more. */ - goto advance; + /* Last record in log. Look for more. 
*/ + goto advance; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { @@ -1906,7 +1919,6 @@ __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { WT_LOG *log; WT_MYSLOT myslot; - uint32_t joined; log = S2C(session)->log; memset(&myslot, 0, sizeof(myslot)); @@ -1914,14 +1926,7 @@ __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) if (did_work != NULL) *did_work = true; myslot.slot = log->active_slot; - joined = WT_LOG_SLOT_JOINED(log->active_slot->slot_state); - if (joined == 0) { - WT_STAT_CONN_INCR(session, log_force_write_skip); - if (did_work != NULL) - *did_work = false; - return (0); - } - return (__wt_log_slot_switch(session, &myslot, retry, true)); + return (__wt_log_slot_switch(session, &myslot, retry, true, did_work)); } /* @@ -2120,7 +2125,11 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_CONN_INCR(session, log_writes); - __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * The only time joining a slot should ever return an error is if it + * detects a panic. + */ + WT_ERR(__wt_log_slot_join(session, rdup_len, flags, &myslot)); /* * If the addition of this record crosses the buffer boundary, * switch in a new slot. @@ -2129,7 +2138,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, ret = 0; if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) - ret = __wt_log_slot_switch(session, &myslot, true, false); + ret = __wt_log_slot_switch(session, &myslot, true, false, NULL); if (ret == 0) ret = __log_fill(session, &myslot, false, record, &lsn); release_size = __wt_log_slot_release( @@ -2154,7 +2163,7 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. 
*/ if (conn->log_cond != NULL) { - __wt_cond_auto_signal(session, conn->log_cond); + __wt_cond_signal(session, conn->log_cond); __wt_yield(); } else WT_ERR(__wt_log_force_write(session, 1, NULL)); @@ -2162,13 +2171,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_write_cond, 10000); + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); + __wt_cond_wait( + session, log->log_write_cond, 10000, NULL); + } } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait(session, log->log_sync_cond, 10000); + myslot.slot->slot_error == 0) { + WT_ERR(WT_SESSION_CHECK_PANIC(session)); + __wt_cond_wait( + session, log->log_sync_cond, 10000, NULL); + } } /* @@ -2193,12 +2208,12 @@ err: /* * If one of the sync flags is set, assert the proper LSN has moved to - * match. + * match on success. 
*/ - WT_ASSERT(session, !LF_ISSET(WT_LOG_FLUSH) || + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); - WT_ASSERT(session, - !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); + WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || + __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } @@ -2223,8 +2238,10 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) return (0); va_copy(ap_copy, ap); - len = (size_t)vsnprintf(NULL, 0, fmt, ap_copy) + 1; + len = 1; + ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy); va_end(ap_copy); + WT_RET(ret); WT_RET( __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); @@ -2241,7 +2258,8 @@ __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) rec_fmt, rectype)); logrec->size += (uint32_t)header_size; - (void)vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap); + WT_ERR(__wt_vsnprintf( + (char *)logrec->data + logrec->size, len, fmt, ap)); __wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size); diff --git a/src/log/log_slot.c b/src/log/log_slot.c index a29a34e5652..97e317ce68c 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -8,6 +8,49 @@ #include "wt_internal.h" +#ifdef HAVE_DIAGNOSTIC +/* + * __log_slot_dump -- + * Dump the entire slot state. 
+ */ +static void +__log_slot_dump(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int earliest, i; + + conn = S2C(session); + log = conn->log; + earliest = 0; + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (__wt_log_cmp(&slot->slot_release_lsn, + &log->slot_pool[earliest].slot_release_lsn) < 0) + earliest = i; + __wt_errx(session, "Slot %d:", i); + __wt_errx(session, " State: %" PRIx64 " Flags: %" PRIx32, + slot->slot_state, slot->flags); + __wt_errx(session, " Start LSN: %" PRIu32 "/%" PRIu32, + slot->slot_start_lsn.l.file, slot->slot_start_lsn.l.offset); + __wt_errx(session, " End LSN: %" PRIu32 "/%" PRIu32, + slot->slot_end_lsn.l.file, slot->slot_end_lsn.l.offset); + __wt_errx(session, " Release LSN: %" PRIu32 "/%" PRIu32, + slot->slot_release_lsn.l.file, + slot->slot_release_lsn.l.offset); + __wt_errx(session, " Offset: start: %" PRIuMAX + " last:%" PRIuMAX, (uintmax_t)slot->slot_start_offset, + (uintmax_t)slot->slot_last_offset); + __wt_errx(session, " Unbuffered: %" PRId64 + " error: %" PRId32, slot->slot_unbuffered, + slot->slot_error); + } + __wt_errx(session, "Earliest slot: %d", earliest); + +} +#endif + /* * __wt_log_slot_activate -- * Initialize a slot to become active. @@ -21,7 +64,6 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) conn = S2C(session); log = conn->log; - slot->slot_state = 0; /* * !!! slot_release_lsn must be set outside this function because * this function may be called after a log file switch and the @@ -30,12 +72,19 @@ __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) * set for closing the file handle on a log file switch. The flags * are reset when the slot is freed. See log_slot_free. 
*/ + slot->slot_unbuffered = 0; slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; slot->slot_start_offset = log->alloc_lsn.l.offset; slot->slot_last_offset = log->alloc_lsn.l.offset; slot->slot_fh = log->log_fh; slot->slot_error = 0; - slot->slot_unbuffered = 0; + WT_DIAGNOSTIC_YIELD; + /* + * Set the slot state last. Other threads may have a stale pointer + * to this slot and could try to alter the state and other fields once + * they see the state cleared. + */ + WT_PUBLISH(slot->slot_state, 0); } /* @@ -50,6 +99,10 @@ __log_slot_close( WT_CONNECTION_IMPL *conn; WT_LOG *log; int64_t end_offset, new_state, old_state; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, releasep != NULL); @@ -101,9 +154,33 @@ retry: * that value. If the state is unbuffered, wait for the unbuffered * size to be set. */ - while (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state) && - slot->slot_unbuffered == 0) - __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif + if (WT_LOG_SLOT_UNBUFFERED_ISSET(old_state)) { + while (slot->slot_unbuffered == 0) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); + __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, "SLOT_CLOSE: Slot %" + PRIu32 " Timeout unbuffered, state 0x%" + PRIx64 " unbuffered %" PRIu64, + (uint32_t)(slot - &log->slot_pool[0]), + slot->slot_state, + slot->slot_unbuffered); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif + } + } end_offset = WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; @@ -118,17 +195,104 @@ retry: } /* + * __log_slot_new -- + * Find a free slot and switch it as the new active slot. + * Must be called holding the slot lock. 
+ */ +static int +__log_slot_new(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int32_t i, pool_i; +#ifdef HAVE_DIAGNOSTIC + struct timespec begin, now; + int count; +#endif + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + conn = S2C(session); + log = conn->log; + /* + * Although this function is single threaded, multiple threads could + * be trying to set a new active slot sequentially. If we find an + * active slot that is valid, return. + */ + if ((slot = log->active_slot) != NULL && + WT_LOG_SLOT_OPEN(slot->slot_state)) + return (0); + +#ifdef HAVE_DIAGNOSTIC + count = 0; + __wt_epoch(session, &begin); +#endif + /* + * Keep trying until we can find a free slot. + */ + for (;;) { + /* + * Rotate among the slots to lessen collisions. + */ + for (i = 0, pool_i = log->pool_index; i < WT_SLOT_POOL; + i++, pool_i++) { + if (pool_i >= WT_SLOT_POOL) + pool_i = 0; + slot = &log->slot_pool[pool_i]; + if (slot->slot_state == WT_LOG_SLOT_FREE) { + /* + * Acquire our starting position in the + * log file. Assume the full buffer size. + */ + WT_RET(__wt_log_acquire(session, + log->slot_buf_size, slot)); + /* + * We have a new, initialized slot to use. + * Set it as the active slot. + */ + WT_STAT_CONN_INCR(session, + log_slot_transitions); + log->active_slot = slot; + log->pool_index = pool_i; + return (0); + } + } + /* + * If we didn't find any free slots signal the worker thread. + */ + WT_STAT_CONN_INCR(session, log_slot_no_free_slots); + __wt_cond_signal(session, conn->log_wrlsn_cond); + __wt_yield(); +#ifdef HAVE_DIAGNOSTIC + ++count; + if (count > WT_MILLION) { + __wt_epoch(session, &now); + if (WT_TIMEDIFF_SEC(now, begin) > 10) { + __wt_errx(session, + "SLOT_NEW: Timeout free slot"); + __log_slot_dump(session); + __wt_abort(session); + } + count = 0; + } +#endif + } + /* NOTREACHED */ +} + +/* * __log_slot_switch_internal -- * Switch out the current slot and set up a new one. 
*/ static int __log_slot_switch_internal( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool forced) + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool forced, bool *did_work) { WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; bool free_slot, release; + uint32_t joined; log = S2C(session)->log; release = false; @@ -142,10 +306,23 @@ __log_slot_switch_internal( */ if (slot != log->active_slot) return (0); + /* + * If the current active slot is unused and this is a forced switch, + * we're done. If this is a non-forced switch we always switch + * because the slot could be part of an unbuffered operation. + */ + joined = WT_LOG_SLOT_JOINED(slot->slot_state); + if (joined == 0 && forced) { + WT_STAT_CONN_INCR(session, log_force_write_skip); + if (did_work != NULL) + *did_work = false; + return (0); + } + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* - * We may come through here multiple times if we were able to close - * a slot but could not set up a new one. If we closed it already, + * We may come through here multiple times if we were not able to + * set up a new one. If we closed it already, * don't try to do it again but still set up the new slot. */ if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) { @@ -157,20 +334,30 @@ __log_slot_switch_internal( if (ret == WT_NOTFOUND) return (0); WT_RET(ret); - if (release) { - WT_RET(__wt_log_release(session, slot, &free_slot)); - if (free_slot) - __wt_log_slot_free(session, slot); - } + /* + * Set that we have closed this slot because we may call in here + * multiple times if we retry creating a new slot. Similarly + * set retain whether this slot needs releasing so that we don't + * lose that information if we retry. + */ + F_SET(myslot, WT_MYSLOT_CLOSE); + if (release) + F_SET(myslot, WT_MYSLOT_NEEDS_RELEASE); } /* - * Set that we have closed this slot because we may call in here - * multiple times if we retry creating a new slot. 
+ * Now that the slot is closed, set up a new one so that joining + * threads don't have to wait on writing the previous slot if we + * release it. Release after setting a new one. */ - F_SET(myslot, WT_MYSLOT_CLOSE); - WT_RET(__wt_log_slot_new(session)); + WT_RET(__log_slot_new(session)); F_CLR(myslot, WT_MYSLOT_CLOSE); - return (0); + if (F_ISSET(myslot, WT_MYSLOT_NEEDS_RELEASE)) { + WT_RET(__wt_log_release(session, slot, &free_slot)); + F_CLR(myslot, WT_MYSLOT_NEEDS_RELEASE); + if (free_slot) + __wt_log_slot_free(session, slot); + } + return (ret); } /* @@ -178,13 +365,14 @@ __log_slot_switch_internal( * Switch out the current slot and set up a new one. */ int -__wt_log_slot_switch( - WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool retry, bool forced) +__wt_log_slot_switch(WT_SESSION_IMPL *session, + WT_MYSLOT *myslot, bool retry, bool forced, bool *did_work) { WT_DECL_RET; WT_LOG *log; log = S2C(session)->log; + /* * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the * compiler does not like it combined directly with the while loop @@ -198,7 +386,8 @@ __wt_log_slot_switch( */ do { WT_WITH_SLOT_LOCK(session, log, - ret = __log_slot_switch_internal(session, myslot, forced)); + ret = __log_slot_switch_internal( + session, myslot, forced, did_work)); if (ret == EBUSY) { WT_STAT_CONN_INCR(session, log_slot_switch_busy); __wt_yield(); @@ -208,67 +397,6 @@ __wt_log_slot_switch( } /* - * __wt_log_slot_new -- - * Find a free slot and switch it as the new active slot. - * Must be called holding the slot lock. - */ -int -__wt_log_slot_new(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - WT_LOGSLOT *slot; - int32_t i; - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - conn = S2C(session); - log = conn->log; - /* - * Although this function is single threaded, multiple threads could - * be trying to set a new active slot sequentially. If we find an - * active slot that is valid, return. 
- */ - if ((slot = log->active_slot) != NULL && - WT_LOG_SLOT_OPEN(slot->slot_state)) - return (0); - - /* - * Keep trying until we can find a free slot. - */ - for (;;) { - /* - * For now just restart at 0. We could use log->pool_index - * if that is inefficient. - */ - for (i = 0; i < WT_SLOT_POOL; i++) { - slot = &log->slot_pool[i]; - if (slot->slot_state == WT_LOG_SLOT_FREE) { - /* - * Acquire our starting position in the - * log file. Assume the full buffer size. - */ - WT_RET(__wt_log_acquire(session, - log->slot_buf_size, slot)); - /* - * We have a new, initialized slot to use. - * Set it as the active slot. - */ - WT_STAT_CONN_INCR(session, - log_slot_transitions); - log->active_slot = slot; - return (0); - } - } - /* - * If we didn't find any free slots signal the worker thread. - */ - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); - __wt_yield(); - } - /* NOTREACHED */ -} - -/* * __wt_log_slot_init -- * Initialize the slot array. */ @@ -311,10 +439,13 @@ __wt_log_slot_init(WT_SESSION_IMPL *session) /* * We cannot initialize the release LSN in the activate function * because that function can be called after a log file switch. + * The release LSN is usually the same as the slot_start_lsn except + * around a log file switch. */ slot->slot_release_lsn = log->alloc_lsn; __wt_log_slot_activate(session, slot); log->active_slot = slot; + log->pool_index = 0; if (0) { err: while (--i >= 0) @@ -361,7 +492,7 @@ __wt_log_slot_destroy(WT_SESSION_IMPL *session) * __wt_log_slot_join -- * Join a consolidated logging slot. 
*/ -void +int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { @@ -370,66 +501,76 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, WT_LOGSLOT *slot; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join; -#ifdef HAVE_DIAGNOSTIC - bool unbuf_force; -#endif + bool unbuffered, yld; conn = S2C(session); log = conn->log; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + WT_ASSERT(session, mysize != 0); /* * There should almost always be a slot open. */ + unbuffered = false; #ifdef HAVE_DIAGNOSTIC - unbuf_force = (++log->write_calls % WT_THOUSAND) == 0; + yld = (++log->write_calls % 7) == 0; + if ((log->write_calls % WT_THOUSAND) == 0 || + mysize > WT_LOG_SLOT_BUF_MAX) { +#else + yld = false; + if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif + unbuffered = true; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + } for (;;) { WT_BARRIER(); + WT_RET(WT_SESSION_CHECK_PANIC(session)); slot = log->active_slot; old_state = slot->slot_state; - /* - * Try to join our size into the existing size and - * atomically write it back into the state. - */ - flag_state = WT_LOG_SLOT_FLAGS(old_state); - released = WT_LOG_SLOT_RELEASED(old_state); - join_offset = WT_LOG_SLOT_JOINED(old_state); -#ifdef HAVE_DIAGNOSTIC - if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { -#else - if (mysize > WT_LOG_SLOT_BUF_MAX) { -#endif - new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; - F_SET(myslot, WT_MYSLOT_UNBUFFERED); - myslot->slot = slot; + if (WT_LOG_SLOT_OPEN(old_state)) { + /* + * Try to join our size into the existing size and + * atomically write it back into the state. 
+ */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); + if (unbuffered) + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, + (int64_t)flag_state); + + /* + * Braces used due to potential empty body warning. + */ + if (yld) { + WT_DIAGNOSTIC_YIELD; + } + /* + * Attempt to swap our size into the state. + */ + if (__wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + WT_STAT_CONN_INCR(session, log_slot_races); } else - new_join = join_offset + (int32_t)mysize; - new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( - (int64_t)new_join, (int64_t)released, (int64_t)flag_state); - - /* - * Check if the slot is open for joining and we are able to - * swap in our size into the state. - */ - if (WT_LOG_SLOT_OPEN(old_state) && - __wt_atomic_casiv64( - &slot->slot_state, old_state, new_state)) - break; + WT_STAT_CONN_INCR(session, log_slot_active_closed); /* * The slot is no longer open or we lost the race to * update it. Yield and try again. */ - WT_STAT_CONN_INCR(session, log_slot_races); __wt_yield(); } /* * We joined this slot. Fill in our information to return to * the caller. 
*/ - if (mysize != 0) - WT_STAT_CONN_INCR(session, log_slot_joins); + WT_STAT_CONN_INCR(session, log_slot_joins); if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) @@ -444,6 +585,7 @@ __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); + return (0); } /* @@ -459,7 +601,6 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) wt_off_t cur_offset, my_start; int64_t my_size, rel_size; - WT_UNUSED(session); slot = myslot->slot; my_start = slot->slot_start_offset + myslot->offset; /* @@ -468,6 +609,7 @@ __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) * was written rather than the beginning record of the slot. */ while ((cur_offset = slot->slot_last_offset) < my_start) { + WT_RET(WT_SESSION_CHECK_PANIC(session)); /* * Set our offset if we are larger. */ diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 839648b97d7..52265f02e62 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -10,10 +10,10 @@ #define WT_FORALL_CURSORS(clsm, c, i) \ for ((i) = (clsm)->nchunks; (i) > 0;) \ - if (((c) = (clsm)->chunks[--i]->cursor) != NULL) + if (((c) = (clsm)->chunks[--(i)]->cursor) != NULL) #define WT_LSM_CURCMP(s, lsm_tree, c1, c2, cmp) \ - __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &cmp) + __wt_compare(s, (lsm_tree)->collator, &(c1)->key, &(c2)->key, &(cmp)) static int __clsm_lookup(WT_CURSOR_LSM *, WT_ITEM *); static int __clsm_open_cursors(WT_CURSOR_LSM *, bool, u_int, uint32_t); @@ -178,20 +178,12 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) if (reset) { WT_ASSERT(session, !F_ISSET(&clsm->iface, - WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); + WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); WT_RET(__clsm_reset_cursors(clsm, NULL)); } for (;;) { - /* - * If the cursor looks up-to-date, check if 
the cache is full. - * In case this call blocks, the check will be repeated before - * proceeding. - */ - if (clsm->dsk_gen != lsm_tree->dsk_gen && - lsm_tree->nchunks != 0) - goto open; - + /* Check if the cursor looks up-to-date. */ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; @@ -304,7 +296,7 @@ __clsm_leave(WT_CURSOR_LSM *clsm) * byte, if the application uses two leading DC4 byte for some reason, we'll do * a wasted data copy each time a new value is inserted into the object. */ -static const WT_ITEM __tombstone = { "\x14\x14", 2, 0, NULL, 0 }; +static const WT_ITEM __tombstone = { "\x14\x14", 2, NULL, 0, 0 }; /* * __clsm_deleted -- @@ -666,7 +658,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { */ if (i != nchunks - 1) clsm->chunks[i]->cursor->insert = - __wt_curfile_update_check; + __wt_curfile_insert_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) @@ -688,19 +680,29 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { - clsm->primary_chunk = chunk; primary = clsm->chunks[clsm->nchunks - 1]->cursor; + btree = ((WT_CURSOR_BTREE *)primary)->btree; + /* - * Disable eviction for the in-memory chunk. Also clear the - * bulk load flag here, otherwise eviction will be enabled by - * the first update. + * If the primary is not yet set as the primary, do that now. + * Note that eviction was configured off when the underlying + * object was created, which is what we want, leave it alone. + * + * We don't have to worry about races here: every thread that + * modifies the tree will have to come through here, at worse + * we set the flag repeatedly. We don't use a WT_BTREE handle + * flag, however, we could race doing the read-modify-write of + * the flags field. 
+ * + * If something caused the chunk to be closed and reopened + * since it was created, we can no longer use it as a primary + * chunk and we need to force a switch. We detect the tree was + * created when it was opened by checking the "original" flag. */ - btree = ((WT_CURSOR_BTREE *)(primary))->btree; - if (btree->bulk_load_ok) { - btree->bulk_load_ok = false; - WT_WITH_BTREE(session, btree, - __wt_btree_lsm_switch_primary(session, true)); - } + if (!btree->lsm_primary && btree->original) + btree->lsm_primary = true; + if (btree->lsm_primary) + clsm->primary_chunk = chunk; } clsm->dsk_gen = lsm_tree->dsk_gen; @@ -1213,7 +1215,8 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_LSM_TREE_STAT_INCR( session, clsm->lsm_tree->bloom_miss); continue; - } else if (ret == 0) + } + if (ret == 0) WT_LSM_TREE_STAT_INCR( session, clsm->lsm_tree->bloom_hit); WT_ERR(ret); @@ -1239,10 +1242,10 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) WT_ERR(WT_NOTFOUND); done: -err: F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); - if (ret == 0) { - clsm->current = c; +err: if (ret == 0) { + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); F_SET(cursor, WT_CURSTD_KEY_INT); + clsm->current = c; if (value == &cursor->value) F_SET(cursor, WT_CURSTD_VALUE_INT); } else if (c != NULL) @@ -1318,7 +1321,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) { ret = 0; continue; - } else if (ret != 0) + } + if (ret != 0) goto err; /* Do we have an exact match? 
*/ @@ -1338,7 +1342,8 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp) if ((ret = c->next(c)) == WT_NOTFOUND) { ret = 0; continue; - } else if (ret != 0) + } + if (ret != 0) goto err; } @@ -1564,12 +1569,23 @@ __clsm_update(WT_CURSOR *cursor) WT_CURSOR_NEEDVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); - if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || - (ret = __clsm_lookup(clsm, &value)) == 0) { - WT_ERR(__clsm_deleted_encode( - session, &cursor->value, &value, &buf)); - ret = __clsm_put(session, clsm, &cursor->key, &value, true); - } + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + WT_ERR(__clsm_lookup(clsm, &value)); + WT_ERR(__clsm_deleted_encode(session, &cursor->value, &value, &buf)); + WT_ERR(__clsm_put(session, clsm, &cursor->key, &value, true)); + + /* + * Set the cursor to reference the internal key/value of the positioned + * cursor. + */ + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + WT_ITEM_SET(cursor->key, clsm->current->key); + WT_ITEM_SET(cursor->value, clsm->current->value); + WT_ASSERT(session, + F_MASK(clsm->current, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT); + WT_ASSERT(session, + F_MASK(clsm->current, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: __wt_scr_free(session, &buf); __clsm_leave(clsm); @@ -1588,18 +1604,34 @@ __clsm_remove(WT_CURSOR *cursor) WT_DECL_RET; WT_ITEM value; WT_SESSION_IMPL *session; + bool positioned; clsm = (WT_CURSOR_LSM *)cursor; + /* Check if the cursor is positioned. 
*/ + positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT); + CURSOR_REMOVE_API_CALL(cursor, session, NULL); WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, true)); - if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) || - (ret = __clsm_lookup(clsm, &value)) == 0) - ret = __clsm_put( - session, clsm, &cursor->key, &__tombstone, true); + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) + WT_ERR(__clsm_lookup(clsm, &value)); + WT_ERR(__clsm_put( + session, clsm, &cursor->key, &__tombstone, positioned)); + + /* + * If the cursor was positioned, it stays positioned with a key but no + * no value, otherwise, there's no position, key or value. This isn't + * just cosmetic, without a reset, iteration on this cursor won't start + * at the beginning/end of the table. + */ + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + if (positioned) + F_SET(cursor, WT_CURSTD_KEY_INT); + else + WT_TRET(cursor->reset(cursor)); err: __clsm_leave(clsm); CURSOR_UPDATE_API_END(session, ret); @@ -1692,8 +1724,8 @@ __wt_clsm_open(WT_SESSION_IMPL *session, bulk = cval.val != 0; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree); + /* * Check whether the exclusive open for a bulk load succeeded, and * if it did ensure that it's safe to bulk load into the tree. 
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index cbd83a5cd30..e33e119aa41 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -89,7 +89,6 @@ __lsm_general_worker_start(WT_SESSION_IMPL *session) if (manager->lsm_workers % 2 == 0) FLD_SET(worker_args->type, WT_LSM_WORK_MERGE); } - F_SET(worker_args, WT_LSM_WORKER_RUN); WT_RET(__wt_lsm_worker_start(session, worker_args)); } @@ -129,17 +128,13 @@ __lsm_stop_workers(WT_SESSION_IMPL *session) manager->lsm_workers--) { worker_args = &manager->lsm_worker_cookies[manager->lsm_workers - 1]; - /* - * Clear this worker's flag so it stops. - */ - F_CLR(worker_args, WT_LSM_WORKER_RUN); - WT_ASSERT(session, worker_args->tid != 0); - WT_RET(__wt_thread_join(session, worker_args->tid)); - worker_args->tid = 0; + WT_ASSERT(session, worker_args->tid_set); + + WT_RET(__wt_lsm_worker_stop(session, worker_args)); worker_args->type = 0; - worker_args->flags = 0; + /* - * We do not clear the session because they are allocated + * We do not clear the other fields because they are allocated * statically when the connection was opened. */ } @@ -237,12 +232,12 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) manager->lsm_worker_cookies[i].session = worker_session; } + F_SET(conn, WT_CONN_SERVER_LSM); + /* Start the LSM manager thread. */ WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager, &manager->lsm_worker_cookies[0])); - F_SET(conn, WT_CONN_SERVER_LSM); - if (0) { err: for (i = 0; (worker_session = @@ -289,13 +284,18 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager = &conn->lsm_manager; removed = 0; + /* + * Clear the LSM server flag and flush to ensure running threads see + * the state change. + */ + F_CLR(conn, WT_CONN_SERVER_LSM); + WT_FULL_BARRIER(); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || manager->lsm_workers == 0); if (manager->lsm_workers > 0) { - /* - * Stop the main LSM manager thread first. 
- */ - while (F_ISSET(conn, WT_CONN_SERVER_LSM)) + /* Wait for the main LSM manager thread to finish. */ + while (!F_ISSET(manager, WT_LSM_MANAGER_SHUTDOWN)) __wt_yield(); /* Clean up open LSM handles. */ @@ -303,7 +303,6 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join( session, manager->lsm_worker_cookies[0].tid)); - manager->lsm_worker_cookies[0].tid = 0; /* Release memory from any operations left on the queue. */ while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) { @@ -342,7 +341,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) /* * __lsm_manager_worker_shutdown -- - * Shutdown the LSM manager and worker threads. + * Shutdown the LSM worker threads. */ static int __lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) @@ -354,14 +353,13 @@ __lsm_manager_worker_shutdown(WT_SESSION_IMPL *session) manager = &S2C(session)->lsm_manager; /* - * Wait for the rest of the LSM workers to shutdown. Stop at index + * Wait for the rest of the LSM workers to shutdown. Start at index * one - since we (the manager) are at index 0. 
*/ for (i = 1; i < manager->lsm_workers; i++) { - WT_ASSERT(session, manager->lsm_worker_cookies[i].tid != 0); - __wt_cond_signal(session, manager->work_cond); - WT_TRET(__wt_thread_join( - session, manager->lsm_worker_cookies[i].tid)); + WT_ASSERT(session, manager->lsm_worker_cookies[i].tid_set); + WT_TRET(__wt_lsm_worker_stop( + session, &manager->lsm_worker_cookies[i])); } return (ret); } @@ -383,12 +381,12 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) conn = S2C(session); dhandle_locked = false; - while (F_ISSET(conn, WT_CONN_SERVER_RUN)) { + while (F_ISSET(conn, WT_CONN_SERVER_LSM)) { __wt_sleep(0, 10000); if (TAILQ_EMPTY(&conn->lsmqh)) continue; - __wt_spin_lock(session, &conn->dhandle_lock); - F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readlock(session, &conn->dhandle_lock); + F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) @@ -448,14 +446,14 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_MERGE, 0, lsm_tree)); } } - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); dhandle_locked = false; } err: if (dhandle_locked) { - __wt_spin_unlock(session, &conn->dhandle_lock); - F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST); + __wt_readunlock(session, &conn->dhandle_lock); + F_CLR(session, WT_SESSION_LOCKED_HANDLE_LIST_READ); } return (ret); } @@ -469,11 +467,13 @@ static WT_THREAD_RET __lsm_worker_manager(void *arg) { WT_DECL_RET; + WT_LSM_MANAGER *manager; WT_LSM_WORKER_ARGS *cookie; WT_SESSION_IMPL *session; cookie = (WT_LSM_WORKER_ARGS *)arg; session = cookie->session; + manager = &S2C(session)->lsm_manager; WT_ERR(__lsm_general_worker_start(session)); WT_ERR(__lsm_manager_run_server(session)); @@ -482,7 +482,11 @@ __lsm_worker_manager(void *arg) if (ret != 0) { err: 
WT_PANIC_MSG(session, ret, "LSM worker manager thread error"); } - F_CLR(S2C(session), WT_CONN_SERVER_LSM); + + /* Connection close waits on us to shutdown, let it know we're done. */ + F_SET(manager, WT_LSM_MANAGER_SHUTDOWN); + WT_FULL_BARRIER(); + return (WT_THREAD_RET_VALUE); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index ceb5f03a2f5..8838638f388 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -187,7 +187,7 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, continue; if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) || chunk->generation > 0) break; - else if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) break; } @@ -625,7 +625,7 @@ err: if (locked) else __wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", - __wt_strerror(session, ret, NULL, 0)); + __wt_strerror(session, ret, NULL, 0)); } F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index 46ead6d6ac4..fc4dde82470 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -229,7 +229,7 @@ __lsm_meta_read_v1( cv.len -= 2; } WT_ERR(__wt_config_check(session, - WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); + WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->bloom_config)); WT_ERR(__wt_config_getones( session, lsmconf, "lsm.bloom_hash_count", &cv)); diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 150de968722..411655878af 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -29,24 +29,22 @@ __curstat_lsm_init( const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL, NULL }; const char *disk_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_open_cursor), - "checkpoint=" WT_CHECKPOINT, NULL, NULL }; + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), + "checkpoint=" WT_CHECKPOINT, NULL, 
NULL }; locked = false; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); WT_ERR(__wt_scr_alloc(session, 0, &uribuf)); /* Propagate all, fast and/or clear to the cursors we open. */ if (cst->flags != 0) { - (void)snprintf(config, sizeof(config), + WT_ERR(__wt_snprintf(config, sizeof(config), "statistics=(%s%s%s%s)", F_ISSET(cst, WT_STAT_TYPE_ALL) ? "all," : "", F_ISSET(cst, WT_STAT_CLEAR) ? "clear," : "", !F_ISSET(cst, WT_STAT_TYPE_ALL) && F_ISSET(cst, WT_STAT_TYPE_FAST) ? "fast," : "", - F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : ""); + F_ISSET(cst, WT_STAT_TYPE_SIZE) ? "size," : "")); cfg[1] = disk_cfg[1] = config; } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 38d87dd852b..a9275976023 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -38,7 +38,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) /* We may be destroying an lsm_tree before it was added. */ if (F_ISSET(lsm_tree, WT_LSM_TREE_OPEN)) { WT_ASSERT(session, final || - F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); TAILQ_REMOVE(&S2C(session)->lsmqh, lsm_tree, q); } @@ -321,9 +321,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, metadata = NULL; /* If the tree can be opened, it already exists. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - if (ret == 0) { + if ((ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)) == 0) { __wt_lsm_tree_release(session, lsm_tree); return (exclusive ? EEXIST : 0); } @@ -339,7 +337,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -404,6 +402,9 @@ __lsm_tree_find(WT_SESSION_IMPL *session, } *treep = lsm_tree; + + WT_ASSERT(session, lsm_tree->excl_session == + (exclusive ? session : NULL)); return (0); } @@ -456,7 +457,8 @@ __lsm_tree_open(WT_SESSION_IMPL *session, conn = S2C(session); lsm_tree = NULL; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, + F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST_WRITE)); /* Start the LSM manager thread if it isn't running. */ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) @@ -469,7 +471,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Try to open the tree. */ WT_RET(__wt_calloc_one(session, &lsm_tree)); - WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree")); + __wt_rwlock_init(session, &lsm_tree->rwlock); WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); @@ -520,14 +522,21 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, { WT_DECL_RET; - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - ret = __lsm_tree_find(session, uri, exclusive, treep); + /* + * Dropping and re-acquiring the lock is safe here, since the tree open + * call checks to see if another thread beat it to opening the tree + * before proceeding. + */ + if (exclusive) + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); + else + WT_WITH_HANDLE_LIST_READ_LOCK(session, + ret = __lsm_tree_find(session, uri, exclusive, treep)); if (ret == WT_NOTFOUND) - ret = __lsm_tree_open(session, uri, exclusive, treep); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_tree_open(session, uri, exclusive, treep)); - WT_ASSERT(session, ret != 0 || - (*treep)->excl_session == (exclusive ? 
session : NULL)); return (ret); } @@ -857,9 +866,7 @@ __wt_lsm_tree_alter( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, false, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -899,9 +906,7 @@ __wt_lsm_tree_drop( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ @@ -934,7 +939,7 @@ __wt_lsm_tree_drop( WT_ASSERT(session, !lsm_tree->active); err: if (locked) __wt_lsm_tree_writeunlock(session, lsm_tree); - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -960,9 +965,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree)); /* Prevent any new opens. */ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1007,7 +1010,7 @@ err: if (locked) * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); return (ret); @@ -1032,9 +1035,7 @@ __wt_lsm_tree_truncate( locked = false; /* Get the LSM tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); /* Prevent any new opens. 
*/ __wt_lsm_tree_writelock(session, lsm_tree); @@ -1068,7 +1069,7 @@ err: if (locked) * the last good version of the metadata will be used, resulting * in a valid (not truncated) tree. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, tret = __lsm_tree_discard(session, lsm_tree, false)); WT_TRET(tret); } @@ -1082,7 +1083,7 @@ err: if (locked) void __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_readlock(session, lsm_tree->rwlock); + __wt_readlock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1100,7 +1101,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_readunlock(session, lsm_tree->rwlock); + __wt_readunlock(session, &lsm_tree->rwlock); } /* @@ -1110,7 +1111,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) void __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { - __wt_writelock(session, lsm_tree->rwlock); + __wt_writelock(session, &lsm_tree->rwlock); /* * Diagnostic: avoid deadlocks with the schema lock: if we need it for @@ -1128,7 +1129,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); - __wt_writeunlock(session, lsm_tree->rwlock); + __wt_writeunlock(session, &lsm_tree->rwlock); } /* @@ -1157,9 +1158,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) /* Tell __wt_schema_worker not to look inside the LSM tree. 
*/ *skipp = true; - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, name, false, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, name, false, &lsm_tree)); if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) WT_ERR_MSG(session, EINVAL, @@ -1356,9 +1355,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, locked = false; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); - WT_RET(ret); + WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); /* * We mark that we're busy using the tree to coordinate diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index d9c185a3f58..e6a29666094 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -230,7 +230,7 @@ __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) { if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { ret = __lsm_bloom_create( - session, lsm_tree, chunk, (u_int)i); + session, lsm_tree, chunk, i); /* * Record if we were successful so that we can * later push a merge work unit. @@ -265,9 +265,9 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; - bool flush_set; + bool flush_set, release_btree; - flush_set = false; + flush_set = release_btree = false; /* * If the chunk is already checkpointed, make sure it is also evicted. @@ -276,7 +276,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; @@ -318,20 +318,18 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. 
*/ - if ((ret = __wt_session_get_btree( - session, chunk->uri, NULL, NULL, 0)) == 0) { - /* - * Set read-uncommitted: we have already checked that all of the - * updates in this chunk are globally visible, use the cheapest - * possible check in reconciliation. - */ - saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_READ_UNCOMMITTED; - ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); - session->txn.isolation = saved_isolation; - WT_TRET(__wt_session_release_btree(session)); - } - WT_ERR(ret); + WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); + release_btree = true; + + /* + * Set read-uncommitted: we have already checked that all of the updates + * in this chunk are globally visible, use the cheapest possible check + * in reconciliation. + */ + saved_isolation = session->txn.isolation; + session->txn.isolation = WT_ISO_READ_UNCOMMITTED; + WT_ERR(__wt_cache_op(session, WT_SYNC_WRITE_LEAVES)); + session->txn.isolation = saved_isolation; __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri); @@ -348,12 +346,14 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - ret = __wt_schema_worker( - session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); + ret = __wt_checkpoint(session, NULL))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); + release_btree = false; + WT_ERR(__wt_session_release_btree(session)); + /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); @@ -376,16 +376,6 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_PUBLISH(chunk->flushing, 0); flush_set = false; - /* - * Clear the no-eviction flag so the primary can be evicted and - * eventually closed. 
Only do this once the checkpoint has succeeded: - * otherwise, accessing the leaf page during the checkpoint can trigger - * forced eviction. - */ - WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); - __wt_btree_lsm_switch_primary(session, false); - WT_ERR(__wt_session_release_btree(session)); - /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); @@ -402,6 +392,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); + if (release_btree) + WT_TRET(__wt_session_release_btree(session)); return (ret); } @@ -517,8 +509,8 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri) * * This will fail with EBUSY if the file is still in use. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + ret = __lsm_discard_handle(session, uri, WT_CHECKPOINT)); WT_RET(ret); /* @@ -610,7 +602,8 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (drop_ret == EBUSY) { ++skipped; continue; - } else if (drop_ret != ENOENT) + } + if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; @@ -621,7 +614,8 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (drop_ret == EBUSY) { ++skipped; continue; - } else if (drop_ret != ENOENT) + } + if (drop_ret != ENOENT) WT_ERR(drop_ret); flush_metadata = true; } diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index b0d0758775d..1cabbd4888d 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -21,7 +21,23 @@ __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) { __wt_verbose(session, WT_VERB_LSM_MANAGER, "Start LSM worker %u type %#" PRIx32, args->id, args->type); - return (__wt_thread_create(session, &args->tid, __lsm_worker, args)); + + args->running = true; + WT_RET(__wt_thread_create(session, &args->tid, __lsm_worker, args)); + args->tid_set = true; + return 
(0); +} + +/* + * __wt_lsm_worker_stop -- + * A wrapper around the LSM worker thread stop. + */ +int +__wt_lsm_worker_stop(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) +{ + args->running = false; + args->tid_set = false; + return (__wt_thread_join(session, args->tid)); } /* @@ -84,7 +100,6 @@ err: __wt_lsm_manager_free_work_unit(session, entry); static WT_THREAD_RET __lsm_worker(void *arg) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LSM_WORK_UNIT *entry; WT_LSM_WORKER_ARGS *cookie; @@ -93,11 +108,9 @@ __lsm_worker(void *arg) cookie = (WT_LSM_WORKER_ARGS *)arg; session = cookie->session; - conn = S2C(session); entry = NULL; - while (F_ISSET(conn, WT_CONN_SERVER_RUN) && - F_ISSET(cookie, WT_LSM_WORKER_RUN)) { + while (cookie->running) { progress = false; /* @@ -154,7 +167,7 @@ __lsm_worker(void *arg) /* Don't busy wait if there was any work to do. */ if (!progress) { - __wt_cond_wait(session, cookie->work_cond, 10000); + __wt_cond_wait(session, cookie->work_cond, 10000, NULL); continue; } } diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index fb483c21dd9..dc93180a5e5 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -45,11 +45,7 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, session, uri, NULL, NULL, 0)) != 0) return (ret == EBUSY ? 
0 : ret); WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock( - session, false)); - else - WT_TRET(__wt_session_release_btree(session)); + WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } WT_RET_NOTFOUND_OK(ret); diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index b985104c2eb..151bbe0e081 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -297,7 +297,7 @@ __wt_meta_ckptlist_get( *ckptbasep = ckptbase; if (0) { -err: __wt_meta_ckptlist_free(session, ckptbase); +err: __wt_meta_ckptlist_free(session, &ckptbase); } __wt_free(session, config); __wt_scr_free(session, &buf); @@ -463,16 +463,16 @@ err: __wt_scr_free(session, &buf); * Discard the checkpoint array. */ void -__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +__wt_meta_ckptlist_free(WT_SESSION_IMPL *session, WT_CKPT **ckptbasep) { - WT_CKPT *ckpt; + WT_CKPT *ckpt, *ckptbase; - if (ckptbase == NULL) + if ((ckptbase = *ckptbasep) == NULL) return; WT_CKPT_FOREACH(ckptbase, ckpt) __wt_meta_checkpoint_free(session, ckpt); - __wt_free(session, ckptbase); + __wt_free(session, *ckptbasep); } /* diff --git a/src/meta/meta_ext.c b/src/meta/meta_ext.c index 50e7568fe77..aa1ea8b974d 100644 --- a/src/meta/meta_ext.c +++ b/src/meta/meta_ext.c @@ -102,5 +102,5 @@ void __wt_metadata_free_ckptlist(WT_SESSION *session, WT_CKPT *ckptbase) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { - __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, ckptbase); + __wt_meta_ckptlist_free((WT_SESSION_IMPL *)session, &ckptbase); } diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 4f60728b2d2..aca69d0e6a2 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -68,9 +68,6 @@ __wt_metadata_cursor_open( if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) F_CLR(btree, WT_BTREE_NO_LOGGING); - /* The metadata file always uses checkpoint IDs in visibility checks. 
*/ - btree->include_checkpoint_txn = true; - return (0); } diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 66e34c728f2..5a089471059 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -242,7 +242,7 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep) WT_DECL_ITEM(buf); WT_DECL_RET; WT_FSTREAM *fs; - bool exist, match; + bool exist; *valuep = NULL; @@ -258,22 +258,19 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep) __metadata_config(session, valuep) : WT_NOTFOUND); WT_RET(__wt_fopen(session, WT_METADATA_TURTLE, 0, WT_STREAM_READ, &fs)); - /* Search for the key. */ WT_ERR(__wt_scr_alloc(session, 512, &buf)); - for (match = false;;) { + + /* Search for the key. */ + do { WT_ERR(__wt_getline(session, fs, buf)); if (buf->size == 0) WT_ERR(WT_NOTFOUND); - if (strcmp(key, buf->data) == 0) - match = true; + } while (strcmp(key, buf->data) != 0); - /* Key matched: read the subsequent line for the value. */ - WT_ERR(__wt_getline(session, fs, buf)); - if (buf->size == 0) - WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE)); - if (match) - break; - } + /* Key matched: read the subsequent line for the value. */ + WT_ERR(__wt_getline(session, fs, buf)); + if (buf->size == 0) + WT_ERR(WT_NOTFOUND); /* Copy the value for the caller. */ WT_ERR(__wt_strdup(session, buf->data, valuep)); @@ -283,7 +280,12 @@ err: WT_TRET(__wt_fclose(session, &fs)); if (ret != 0) __wt_free(session, *valuep); - return (ret); + + /* + * A file error or a missing key/value pair in the turtle file means + * something has gone horribly wrong -- we're done. + */ + return (ret == 0 ? 
0 : __wt_illegal_value(session, WT_METADATA_TURTLE)); } /* @@ -322,5 +324,9 @@ __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value) err: WT_TRET(__wt_fclose(session, &fs)); WT_TRET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET, false)); - return (ret); + /* + * An error updating the turtle file means something has gone horribly + * wrong -- we're done. + */ + return (ret == 0 ? 0 : __wt_illegal_value(session, WT_METADATA_TURTLE)); } diff --git a/src/os_common/filename.c b/src/os_common/filename.c index 5aeb64bb51e..d5695f63d91 100644 --- a/src/os_common/filename.c +++ b/src/os_common/filename.c @@ -29,6 +29,7 @@ int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path) { + WT_DECL_RET; size_t len; char *buf; @@ -39,16 +40,17 @@ __wt_nfilename( * the exists API which is used by the test utilities. */ if (session == NULL || __wt_absolute_path(name)) - WT_RET(__wt_strndup(session, name, namelen, path)); - else { - len = strlen(S2C(session)->home) + 1 + namelen + 1; - WT_RET(__wt_calloc(session, 1, len, &buf)); - snprintf(buf, len, "%s%s%.*s", S2C(session)->home, - __wt_path_separator(), (int)namelen, name); - *path = buf; - } + return (__wt_strndup(session, name, namelen, path)); + len = strlen(S2C(session)->home) + 1 + namelen + 1; + WT_RET(__wt_calloc(session, 1, len, &buf)); + WT_ERR(__wt_snprintf(buf, len, "%s%s%.*s", + S2C(session)->home, __wt_path_separator(), (int)namelen, name)); + *path = buf; return (0); + +err: __wt_free(session, buf); + return (ret); } /* diff --git a/src/os_common/os_errno.c b/src/os_common/os_errno.c index a8e56b7f1aa..7ac89536e79 100644 --- a/src/os_common/os_errno.c +++ b/src/os_common/os_errno.c @@ -44,7 +44,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen) * Fallback to a generic message. 
*/ if (session == NULL && - snprintf(errbuf, errlen, "error return: %d", error) > 0) + __wt_snprintf(errbuf, errlen, "error return: %d", error) == 0) return (errbuf); if (session != NULL && __wt_buf_fmt( session, &session->err, "error return: %d", error) == 0) diff --git a/src/os_common/os_fstream.c b/src/os_common/os_fstream.c index 5a368ea75e6..744da732d84 100644 --- a/src/os_common/os_fstream.c +++ b/src/os_common/os_fstream.c @@ -144,7 +144,7 @@ __fstream_printf( p = (char *)((uint8_t *)buf->mem + buf->size); WT_ASSERT(session, buf->memsize >= buf->size); space = buf->memsize - buf->size; - len = (size_t)vsnprintf(p, space, fmt, ap_copy); + WT_RET(__wt_vsnprintf_len_set(p, space, &len, fmt, ap_copy)); va_end(ap_copy); if (len < space) { diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index be8b1abda31..a5ee78f9e3e 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; WT_DECL_RET; @@ -27,7 +26,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, WT_ERR(pthread_cond_init(&cond->cond, NULL)); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -42,8 +41,8 @@ err: __wt_free(session, cond); * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { struct timespec ts; WT_DECL_RET; @@ -62,6 +61,23 @@ __wt_cond_wait_signal( WT_ERR(pthread_mutex_lock(&cond->mtx)); locked = true; + /* + * It's possible to race with threads waking us up. 
That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { __wt_epoch(session, &ts); ts.tv_sec += (time_t) @@ -81,7 +97,7 @@ __wt_cond_wait_signal( ret == ETIME || #endif ret == ETIMEDOUT) { - *signalled = false; +skipping: *signalled = false; ret = 0; } diff --git a/src/os_posix/os_snprintf.c b/src/os_posix/os_snprintf.c new file mode 100644 index 00000000000..390e2e0334a --- /dev/null +++ b/src/os_posix/os_snprintf.c @@ -0,0 +1,27 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_vsnprintf_len_incr -- + * POSIX vsnprintf convenience function, incrementing the returned size. 
+ */ +int +__wt_vsnprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + WT_DECL_RET; + + if ((ret = vsnprintf(buf, size, fmt, ap)) >= 0) { + *retsizep += (size_t)ret; + return (0); + } + return (__wt_errno()); +} diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c index 9bf36cc2686..18e4c347436 100644 --- a/src/os_posix/os_thread.c +++ b/src/os_posix/os_thread.c @@ -18,6 +18,13 @@ __wt_thread_create(WT_SESSION_IMPL *session, { WT_DECL_RET; + /* + * Creating a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to start. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + /* Spawn a new thread of control. */ WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret); if (ret == 0) @@ -34,6 +41,13 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { WT_DECL_RET; + /* + * Joining a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to halt. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + WT_SYSCALL(pthread_join(tid, NULL), ret); if (ret == 0) return (0); @@ -45,7 +59,7 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) * __wt_thread_id -- * Fill in a printable version of the process and thread IDs. 
*/ -void +int __wt_thread_id(char *buf, size_t buflen) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { @@ -57,10 +71,10 @@ __wt_thread_id(char *buf, size_t buflen) */ self = pthread_self(); #ifdef __sun - (void)snprintf(buf, buflen, - "%" PRIuMAX ":%u", (uintmax_t)getpid(), self); + return (__wt_snprintf(buf, buflen, + "%" PRIuMAX ":%u", (uintmax_t)getpid(), self)); #else - (void)snprintf(buf, buflen, - "%" PRIuMAX ":%p", (uintmax_t)getpid(), (void *)self); + return (__wt_snprintf(buf, buflen, + "%" PRIuMAX ":%p", (uintmax_t)getpid(), (void *)self)); #endif } diff --git a/src/os_posix/os_yield.c b/src/os_posix/os_yield.c index 37d05bc1854..f7c43aae746 100644 --- a/src/os_posix/os_yield.c +++ b/src/os_posix/os_yield.c @@ -16,5 +16,13 @@ void __wt_yield(void) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. + */ + WT_FULL_BARRIER(); + sched_yield(); } diff --git a/src/os_win/os_fs.c b/src/os_win/os_fs.c index 2f76fff04a5..5cf47ea5763 100644 --- a/src/os_win/os_fs.c +++ b/src/os_win/os_fs.c @@ -87,22 +87,19 @@ __win_fs_rename(WT_FILE_SYSTEM *file_system, WT_ERR(__wt_to_utf16_string(session, to, &to_wide)); /* - * Check if file exists since Windows does not override the file if - * it exists. + * We want an atomic rename, but that's not guaranteed by MoveFileExW + * (or by any MSDN API). Don't set the MOVEFILE_COPY_ALLOWED flag to + * prevent the system from falling back to a copy and delete process. + * Do set the MOVEFILE_WRITE_THROUGH flag so the window is as small + * as possible, just in case. WiredTiger renames are done in a single + * directory and we expect that to be an atomic metadata update on any + * modern filesystem. 
*/ - if (GetFileAttributesW(to_wide->data) != INVALID_FILE_ATTRIBUTES) - if (DeleteFileW(to_wide->data) == FALSE) { - windows_error = __wt_getlasterror(); - __wt_errx(session, - "%s: file-rename: DeleteFileW: %s", - to, __wt_formatmessage(session, windows_error)); - WT_ERR(__wt_map_windows_error(windows_error)); - } - - if (MoveFileW(from_wide->data, to_wide->data) == FALSE) { + if (MoveFileExW(from_wide->data, to_wide->data, + MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH) == FALSE) { windows_error = __wt_getlasterror(); __wt_errx(session, - "%s to %s: file-rename: MoveFileW: %s", + "%s to %s: file-rename: MoveFileExW: %s", from, to, __wt_formatmessage(session, windows_error)); WT_ERR(__wt_map_windows_error(windows_error)); } diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 79c62ccd7f2..0001c6c2322 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -13,8 +13,7 @@ * Allocate and initialize a condition variable. */ int -__wt_cond_alloc(WT_SESSION_IMPL *session, - const char *name, bool is_signalled, WT_CONDVAR **condp) +__wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, WT_CONDVAR **condp) { WT_CONDVAR *cond; @@ -26,7 +25,7 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, InitializeConditionVariable(&cond->cond); cond->name = name; - cond->waiters = is_signalled ? -1 : 0; + cond->waiters = 0; *condp = cond; return (0); @@ -38,8 +37,8 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, * out period expires, let the caller know. */ void -__wt_cond_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, bool *signalled) +__wt_cond_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + uint64_t usecs, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { BOOL sleepret; DWORD milliseconds, windows_error; @@ -59,8 +58,26 @@ __wt_cond_wait_signal( EnterCriticalSection(&cond->mtx); locked = true; + /* + * It's possible to race with threads waking us up. 
That's not a problem + * if there are multiple wakeups because the next wakeup will get us, or + * if we're only pausing for a short period. It's a problem if there's + * only a single wakeup, our waker is likely waiting for us to exit. + * After acquiring the mutex (so we're guaranteed to be awakened by any + * future wakeup call), optionally check if we're OK to keep running. + * This won't ensure our caller won't just loop and call us again, but + * at least it's not our fault. + * + * Assert we're not waiting longer than a second if not checking the + * run status. + */ + WT_ASSERT(session, run_func != NULL || usecs <= WT_MILLION); + + if (run_func != NULL && !run_func(session)) + goto skipping; + if (usecs > 0) { - milliseconds64 = usecs / 1000; + milliseconds64 = usecs / WT_THOUSAND; /* * Check for 32-bit unsigned integer overflow @@ -90,7 +107,7 @@ __wt_cond_wait_signal( if (sleepret == 0) { windows_error = __wt_getlasterror(); if (windows_error == ERROR_TIMEOUT) { - *signalled = false; +skipping: *signalled = false; sleepret = 1; } } @@ -117,17 +134,17 @@ void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) { WT_DECL_RET; - bool locked; - - locked = false; __wt_verbose(session, WT_VERB_MUTEX, "signal %s", cond->name); /* - * Our callers are often setting flags to cause a thread to exit. Add - * a barrier to ensure the flags are seen by the threads. + * Our callers often set flags to cause a thread to exit. Add a barrier + * to ensure exit flags are seen by the sleeping threads, otherwise we + * can wake up a thread, it immediately goes back to sleep, and we'll + * hang. Use a full barrier (we may not write before waiting on thread + * join). 
*/ - WT_WRITE_BARRIER(); + WT_FULL_BARRIER(); /* * Fast path if we are in (or can enter), a state where the next waiter diff --git a/src/os_win/os_snprintf.c b/src/os_win/os_snprintf.c index a6056ff9342..f3025b12a60 100644 --- a/src/os_win/os_snprintf.c +++ b/src/os_win/os_snprintf.c @@ -8,17 +8,47 @@ #include "wt_internal.h" -_Check_return_opt_ int __cdecl _wt_snprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, ...) +/* + * __wt_vsnprintf_len_incr -- + * POSIX vsnprintf convenience function, incrementing the returned size. + */ +int +__wt_vsnprintf_len_incr( + char *buf, size_t size, size_t *retsizep, const char *fmt, va_list ap) { - va_list args; - WT_DECL_RET; + int len; + + /* + * WiredTiger calls with length 0 to get the needed buffer size. Call + * the count only version in this case, _vsnprintf_s will invoke the + * invalid parameter handler if count is less than or equal to zero. + */ + if (size == 0) { + *retsizep += (size_t)_vscprintf(fmt, ap); + return (0); + } + + /* + * Additionally, the invalid parameter handler is invoked if buffer or + * format is a NULL pointer. + */ + if (buf == NULL || fmt == NULL) + return (EINVAL); + + /* + * If the storage required to store the data and a terminating null + * exceeds size, the invalid parameter handler is invoked, unless + * count is _TRUNCATE, in which case as much of the string as will + * fit in the buffer is written and -1 returned. + */ + if ((len = _vsnprintf_s(buf, size, _TRUNCATE, fmt, ap)) >= 0) { + *retsizep += (size_t)len; + return (0); + } - va_start(args, _Format); - ret = _wt_vsnprintf(_DstBuf, _MaxCount, _Format, args); - va_end(args); + /* Return the buffer size required. 
*/ + if (len == -1) + *retsizep += (size_t)_vscprintf(fmt, ap); - return (ret); + return (0); } diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c index a34dff776b6..4c8f212bb4f 100644 --- a/src/os_win/os_thread.c +++ b/src/os_win/os_thread.c @@ -16,6 +16,13 @@ int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) { + /* + * Creating a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to start. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + /* Spawn a new thread of control. */ *tidret = (HANDLE)_beginthreadex(NULL, 0, func, arg, 0, NULL); if (*tidret != 0) @@ -33,6 +40,13 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { DWORD windows_error; + /* + * Joining a thread isn't a memory barrier, but WiredTiger commonly + * sets flags and or state and then expects worker threads to halt. + * Include a barrier to ensure safety in those cases. + */ + WT_FULL_BARRIER(); + if ((windows_error = WaitForSingleObject(tid, INFINITE)) != WAIT_OBJECT_0) { if (windows_error == WAIT_FAILED) @@ -58,10 +72,10 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) * __wt_thread_id -- * Fill in a printable version of the process and thread IDs. */ -void +int __wt_thread_id(char *buf, size_t buflen) { - (void)snprintf(buf, buflen, + return (__wt_snprintf(buf, buflen, "%" PRIu64 ":%" PRIu64, - (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId); + (uint64_t)GetCurrentProcessId(), (uint64_t)GetCurrentThreadId)); } diff --git a/src/os_win/os_vsnprintf.c b/src/os_win/os_vsnprintf.c deleted file mode 100644 index 63f96e79d5b..00000000000 --- a/src/os_win/os_vsnprintf.c +++ /dev/null @@ -1,41 +0,0 @@ -/*- - * Copyright (c) 2014-2016 MongoDB, Inc. - * Copyright (c) 2008-2014 WiredTiger, Inc. - * All rights reserved. - * - * See the file LICENSE for redistribution information. 
- */ - -#include "wt_internal.h" - -_Check_return_opt_ int __cdecl _wt_vsnprintf( - _Out_writes_(_MaxCount) char * _DstBuf, - _In_ size_t _MaxCount, - _In_z_ _Printf_format_string_ const char * _Format, - va_list _ArgList) -{ - int len; - - /* - * WiredTiger will call with length 0 to get the needed buffer size - * We call the count only version in this case since vsnprintf_s assumes - * length is greater than zero or else it triggers the invalid_parameter - * handler. - */ - if (_MaxCount == 0) { - return _vscprintf(_Format, _ArgList); - } - - len = (size_t)_vsnprintf_s( - _DstBuf, _MaxCount, _TRUNCATE, _Format, _ArgList); - - /* - * The MSVC implementation returns -1 on truncation instead of what - * it would have written. We could let callers iteratively grow the - * buffer, or just ask us how big a buffer they would like. - */ - if (len == -1) - len = _vscprintf(_Format, _ArgList) + 1; - - return (len); -} diff --git a/src/os_win/os_yield.c b/src/os_win/os_yield.c index aab1559e072..038f2efe162 100644 --- a/src/os_win/os_yield.c +++ b/src/os_win/os_yield.c @@ -15,5 +15,13 @@ void __wt_yield(void) { + /* + * Yielding the processor isn't documented as a memory barrier, and it's + * a reasonable expectation to have. There's no reason not to explicitly + * include a barrier since we're giving up the CPU, and ensures callers + * aren't ever surprised. 
+ */ + WT_FULL_BARRIER(); + SwitchToThread(); } diff --git a/src/reconcile/rec_track.c b/src/reconcile/rec_track.c index 3795b6e5ae8..5bf425b1b21 100644 --- a/src/reconcile/rec_track.c +++ b/src/reconcile/rec_track.c @@ -875,9 +875,9 @@ __wt_ovfl_track_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } @@ -903,9 +903,9 @@ __wt_ovfl_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__ovfl_reuse_wrapup_err(session, page)); if (track->ovfl_txnc[0] != NULL) { - __wt_writelock(session, S2BT(session)->ovfl_lock); + __wt_writelock(session, &S2BT(session)->ovfl_lock); ret = __ovfl_txnc_wrapup(session, page); - __wt_writeunlock(session, S2BT(session)->ovfl_lock); + __wt_writeunlock(session, &S2BT(session)->ovfl_lock); } return (ret); } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index e82f449a50d..6f95b84d292 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -26,6 +26,11 @@ typedef struct { uint32_t flags; /* Caller's configuration */ WT_ITEM disk_image; /* Temporary disk-image buffer */ + /* + * Temporary buffer used to write out a disk image when managing two + * chunks worth of data in memory + */ + WT_ITEM *interim_buf; /* * Track start/stop write generation to decide if all changes to the @@ -127,6 +132,7 @@ typedef struct { * repeatedly split a packed page. 
*/ uint32_t split_size; /* Split page size */ + uint32_t min_split_size; /* Minimum split page size */ /* * The problem with splits is we've done a lot of work by the time we @@ -151,16 +157,6 @@ typedef struct { */ size_t offset; /* Split's first byte */ - /* - * The recno and entries fields are the starting record number - * of the split chunk (for column-store splits), and the number - * of entries in the split chunk. These fields are used both - * to write the split chunk, and to create a new internal page - * to reference the split pages. - */ - uint64_t recno; /* Split's starting record */ - uint32_t entries; /* Split's entries */ - WT_ADDR addr; /* Split's written location */ uint32_t size; /* Split's size */ uint32_t checksum; /* Split's checksum */ @@ -182,11 +178,36 @@ typedef struct { size_t supd_allocated; /* + * While reconciling pages, at any given time, we maintain two + * split chunks in the memory to be written out as pages. As we + * get to the last two chunks, if the last one turns out to be + * smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size + * boundary. This moves some data from the penultimate chunk to + * the last chunk, hence increasing the size of the last page + * written without decreasing the penultimate page size beyond + * the minimum split size. For this reason, we maintain both a + * maximum split percentage boundary and a minimum split + * percentage boundary. + * + * The recno and entries fields are the starting record number + * of the split chunk (for column-store splits), and the number + * of entries in the split chunk. These fields are used both to + * write the split chunk, and to create a new internal page to + * reference the split pages. + * * The key for a row-store page; no column-store key is needed * because the page's recno, stored in the recno field, is the * column-store key. 
*/ - WT_ITEM key; /* Promoted row-store key */ + uint32_t max_bnd_entries; + uint64_t max_bnd_recno; + WT_ITEM max_bnd_key; + + size_t min_bnd_offset; + uint32_t min_bnd_entries; + uint64_t min_bnd_recno; + WT_ITEM min_bnd_key; } *bnd; /* Saved boundaries */ uint32_t bnd_next; /* Next boundary slot */ uint32_t bnd_next_max; /* Maximum boundary slots used */ @@ -194,28 +215,6 @@ typedef struct { size_t bnd_allocated; /* Bytes allocated */ /* - * We track the total number of page entries copied into split chunks - * so we can easily figure out how many entries in the current split - * chunk. - */ - uint32_t total_entries; /* Total entries in splits */ - - /* - * And there's state information as to where in this process we are: - * (1) tracking split boundaries because we can still fit more split - * chunks into the maximum page size, (2) tracking the maximum page - * size boundary because we can't fit any more split chunks into the - * maximum page size, (3) not performing boundary checks because it's - * either not useful with the current page size configuration, or - * because we've already been forced to split. - */ - enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */ - SPLIT_MAX=1, /* Next: the maximum page boundary */ - SPLIT_TRACKING_OFF=2, /* No boundary checks */ - SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */ - bnd_state; - - /* * We track current information about the current record number, the * number of entries copied into the temporary buffer, where we are * in the temporary buffer, and how much memory remains. 
Those items @@ -226,6 +225,8 @@ typedef struct { uint32_t entries; /* Current number of entries */ uint8_t *first_free; /* Current first free byte */ size_t space_avail; /* Remaining space in this chunk */ + /* Remaining space in this chunk to put a minimum size boundary */ + size_t min_space_avail; /* * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and @@ -247,15 +248,14 @@ typedef struct { /* * WT_DICTIONARY -- - * We optionally build a dictionary of row-store values for leaf - * pages. Where two value cells are identical, only write the value - * once, the second and subsequent copies point to the original cell. - * The dictionary is fixed size, but organized in a skip-list to make - * searches faster. + * We optionally build a dictionary of values for leaf pages. Where + * two value cells are identical, only write the value once, the second + * and subsequent copies point to the original cell. The dictionary is + * fixed size, but organized in a skip-list to make searches faster. 
*/ struct __rec_dictionary { uint64_t hash; /* Hash value */ - void *cell; /* Matching cell */ + uint32_t offset; /* Matching cell */ u_int depth; /* Skiplist */ WT_DICTIONARY *next[0]; @@ -293,6 +293,13 @@ typedef struct { uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; +#define WT_CROSSING_MIN_BND(r, next_len) \ + ((r)->bnd[(r)->bnd_next].min_bnd_offset == 0 && \ + (next_len) > (r)->min_space_avail) +#define WT_CROSSING_SPLIT_BND(r, next_len) ((next_len) > (r)->space_avail) +#define WT_CHECK_CROSSING_BND(r, next_len) \ + (WT_CROSSING_MIN_BND(r, next_len) || WT_CROSSING_SPLIT_BND(r, next_len)) + static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); @@ -314,6 +321,7 @@ static int __rec_col_var(WT_SESSION_IMPL *, static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *, WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t); static int __rec_destroy_session(WT_SESSION_IMPL *); +static uint32_t __rec_min_split_page_size(WT_BTREE *, uint32_t); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_row_leaf(WT_SESSION_IMPL *, @@ -323,7 +331,6 @@ static int __rec_row_leaf_insert( static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *); -static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); @@ -968,6 +975,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->disk_image); + __wt_scr_free(session, &r->interim_buf); 
__wt_free(session, r->raw_entries); __wt_free(session, r->raw_offsets); @@ -1032,7 +1040,8 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, bool destroy) __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->disk_image); __wt_free(session, bnd->supd); - __wt_buf_free(session, &bnd->key); + __wt_buf_free(session, &bnd->max_bnd_key); + __wt_buf_free(session, &bnd->min_bnd_key); } __wt_free(session, r->bnd); r->bnd_next = 0; @@ -1395,7 +1404,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ #define WT_CHILD_RELEASE(session, hazard, ref) do { \ if (hazard) { \ - hazard = false; \ + (hazard) = false; \ WT_TRET( \ __wt_page_release(session, ref, WT_READ_NO_EVICT)); \ } \ @@ -1717,6 +1726,17 @@ __rec_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t v, size_t size) r->entries += v; r->space_avail -= size; r->first_free += size; + + /* + * If offset for the minimum split size boundary is not set, we have not + * yet reached the minimum boundary, reduce the space available for it. + */ + if (r->bnd[r->bnd_next].min_bnd_offset == 0) { + if (r->min_space_avail >= size) + r->min_space_avail -= size; + else + r->min_space_avail = 0; + } } /* @@ -1737,7 +1757,7 @@ __rec_copy_incr(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_KV *kv) * WT_CELLs are typically small, 1 or 2 bytes -- don't call memcpy, do * the copy in-line. */ - for (p = (uint8_t *)r->first_free, + for (p = r->first_free, t = (uint8_t *)&kv->cell, len = kv->cell_len; len > 0; --len) *p++ = *t++; @@ -1781,16 +1801,22 @@ __rec_dict_replace( return (0); /* - * If the dictionary cell reference is not set, we're creating a new - * entry in the dictionary, update its location. + * If the dictionary offset isn't set, we're creating a new entry in the + * dictionary, set its location. * - * If the dictionary cell reference is set, we have a matching value. - * Create a copy cell instead. + * If the dictionary offset is set, we have a matching value. 
Create a + * copy cell instead. */ - if (dp->cell == NULL) - dp->cell = r->first_free; + if (dp->offset == 0) + dp->offset = WT_PTRDIFF32(r->first_free, r->disk_image.mem); else { - offset = WT_PTRDIFF(r->first_free, dp->cell); + /* + * The offset is the byte offset from this cell to the previous, + * matching cell, NOT the byte offset from the beginning of the + * page. + */ + offset = (uint64_t)WT_PTRDIFF(r->first_free, + (uint8_t *)r->disk_image.mem + dp->offset); val->len = val->cell_len = __wt_cell_pack_copy(&val->cell, rle, offset); val->buf.data = NULL; @@ -1927,8 +1953,8 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = WT_RECNO_OOB; - bnd->entries = 0; + bnd->max_bnd_recno = WT_RECNO_OOB; + bnd->max_bnd_entries = 0; __wt_free(session, bnd->addr.addr); WT_CLEAR(bnd->addr); @@ -1943,6 +1969,10 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->already_compressed = false; + bnd->min_bnd_offset = 0; + bnd->min_bnd_entries = 0; + bnd->min_bnd_recno = WT_RECNO_OOB; + /* * Don't touch the key, we re-use that memory in each new * reconciliation. @@ -1974,40 +2004,64 @@ __rec_split_bnd_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __wt_split_page_size -- - * Split page size calculation: we don't want to repeatedly split every - * time a new entry is added, so we split to a smaller-than-maximum page size. + * __rec_split_page_size_from_pct -- + * Given a split percentage, calculate split page size in bytes. */ -uint32_t -__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) -{ +static uint32_t +__rec_split_page_size_from_pct( + int split_pct, uint32_t maxpagesize, uint32_t allocsize) { uintmax_t a; uint32_t split_size; /* * Ideally, the split page size is some percentage of the maximum page - * size rounded to an allocation unit (round to an allocation unit so - * we don't waste space when we write). 
+ * size rounded to an allocation unit (round to an allocation unit so we + * don't waste space when we write). */ a = maxpagesize; /* Don't overflow. */ split_size = (uint32_t)WT_ALIGN_NEAREST( - (a * (u_int)btree->split_pct) / 100, btree->allocsize); + (a * (u_int)split_pct) / 100, allocsize); /* - * Respect the configured split percentage if the calculated split - * size is either zero or a full page. The user has either configured - * an allocation size that matches the page size, or a split - * percentage that is close to zero or one hundred. Rounding is going - * to provide a worse outcome than having a split point that doesn't - * fall on an allocation size boundary in those cases. + * Respect the configured split percentage if the calculated split size + * is either zero or a full page. The user has either configured an + * allocation size that matches the page size, or a split percentage + * that is close to zero or one hundred. Rounding is going to provide a + * worse outcome than having a split point that doesn't fall on an + * allocation size boundary in those cases. */ if (split_size == 0 || split_size == maxpagesize) - split_size = (uint32_t)((a * (u_int)btree->split_pct) / 100); + split_size = (uint32_t)((a * (u_int)split_pct) / 100); return (split_size); } /* + * __wt_split_page_size -- + * Split page size calculation: we don't want to repeatedly split every + * time a new entry is added, so we split to a smaller-than-maximum page size. + */ +uint32_t +__wt_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + btree->split_pct, maxpagesize, btree->allocsize)); +} + +/* + * __rec_min_split_page_size -- + * Minimum split size boundary calculation: To track a boundary at the + * minimum split size that we could have split at instead of splitting at + * the split page size. 
+ */ +static uint32_t +__rec_min_split_page_size(WT_BTREE *btree, uint32_t maxpagesize) +{ + return (__rec_split_page_size_from_pct( + WT_BTREE_MIN_SPLIT_PCT, maxpagesize, btree->allocsize)); +} + +/* * __rec_split_init -- * Initialization for the reconciliation split functions. */ @@ -2018,7 +2072,7 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_BM *bm; WT_BTREE *btree; WT_PAGE_HEADER *dsk; - size_t corrected_page_size; + size_t corrected_page_size, disk_img_buf_size; btree = S2BT(session); bm = btree->bm; @@ -2053,33 +2107,6 @@ __rec_split_init(WT_SESSION_IMPL *session, r->max_raw_page_size = r->page_size = (uint32_t)WT_MIN(r->page_size * 10, WT_MAX(r->page_size, btree->maxmempage / 2)); - - /* - * Ensure the disk image buffer is large enough for the max object, as - * corrected by the underlying block manager. - */ - corrected_page_size = r->page_size; - WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); - - /* - * Clear the disk page header to ensure all of it is initialized, even - * the unused fields. - * - * In the case of fixed-length column-store, clear the entire buffer: - * fixed-length column-store sets bits in bytes, where the bytes are - * assumed to initially be 0. - */ - memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? - corrected_page_size : WT_PAGE_HEADER_SIZE); - - /* - * Set the page type (the type doesn't change, and setting it later - * would require additional code in a few different places). - */ - dsk = r->disk_image.mem; - dsk->type = page->type; - /* * If we have to split, we want to choose a smaller page size for the * split pages, because otherwise we could end up splitting one large @@ -2099,22 +2126,28 @@ __rec_split_init(WT_SESSION_IMPL *session, * creating overflow items and compacted data, for example, as those * items have already been written to disk). 
So, the loop calls the * helper functions when approaching a split boundary, and we save the - * information at that point. That allows us to go back and split the - * page at the boundary points if we eventually overflow the maximum - * page size. + * information at that point. We also save the boundary information at + * the minimum split size. We maintain two chunks (each boundary + * represents a chunk that gets written as a page) in the memory, + * writing out the older one to the disk as a page when we need to make + * space for a new chunk. On reaching the last chunk, if it turns out to + * be smaller than the minimum split size, we go back into the + * penultimate chunk and split at this minimum split size boundary. This + * moves some data from the penultimate chunk to the last chunk, hence + * increasing the size of the last page written without decreasing the + * penultimate page size beyond the minimum split size. * * Finally, all this doesn't matter for fixed-size column-store pages, * raw compression, and salvage. Fixed-size column store pages can * split under (very) rare circumstances, but they're allocated at a * fixed page size, never anything smaller. In raw compression, the - * underlying compression routine decides when we split, so it's not - * our problem. In salvage, as noted above, we can't split at all. + * underlying compression routine decides when we split, so it's not our + * problem. In salvage, as noted above, we can't split at all. 
*/ if (r->raw_compression || r->salvage != NULL) { r->split_size = 0; r->space_avail = r->page_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - } - else if (page->type == WT_PAGE_COL_FIX) { + } else if (page->type == WT_PAGE_COL_FIX) { r->split_size = r->page_size; r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); @@ -2122,32 +2155,55 @@ __rec_split_init(WT_SESSION_IMPL *session, r->split_size = __wt_split_page_size(btree, r->page_size); r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_split_size = + __rec_min_split_page_size(btree, r->page_size); + r->min_space_avail = + r->min_split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); } + + /* + * Ensure the disk image buffer is large enough for the max object, as + * corrected by the underlying block manager. + * + * The buffer that we build disk image in, needs to hold two chunks + * worth of data. Since we want to support split_size more than the page + * size (to allow for adjustments based on the compression), this buffer + * should be greater of twice of split_size and page_size. + */ + corrected_page_size = r->page_size; + disk_img_buf_size = 2 * WT_MAX(corrected_page_size, r->split_size); + WT_RET(bm->write_size(bm, session, &corrected_page_size)); + WT_RET(__wt_buf_init(session, &r->disk_image, disk_img_buf_size)); + + /* + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? + disk_img_buf_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). + */ + dsk = r->disk_image.mem; + dsk->type = page->type; + r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); /* Initialize the first boundary. 
*/ r->bnd_next = 0; WT_RET(__rec_split_bnd_grow(session, r)); __rec_split_bnd_init(session, &r->bnd[0]); - r->bnd[0].recno = recno; + r->bnd[0].max_bnd_recno = recno; r->bnd[0].offset = WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * If the maximum page size is the same as the split page size, either - * because of the object type or application configuration, there isn't - * any need to maintain split boundaries within a larger page. - * - * No configuration for salvage here, because salvage can't split. - */ - if (r->raw_compression) - r->bnd_state = SPLIT_TRACKING_RAW; - else if (max == r->split_size) - r->bnd_state = SPLIT_TRACKING_OFF; - else - r->bnd_state = SPLIT_BOUNDARY; - - /* Initialize the entry counters. */ - r->entries = r->total_entries = 0; + /* Initialize the entry counter. */ + r->entries = 0; /* Initialize the starting record number. */ r->recno = recno; @@ -2350,19 +2406,112 @@ __rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) { WT_BM *bm; WT_BTREE *btree; - size_t corrected_page_size, len; + size_t corrected_page_size, inuse, len; btree = S2BT(session); bm = btree->bm; len = WT_PTRDIFF(r->first_free, r->disk_image.mem); - corrected_page_size = len + add_len; + inuse = (len - r->bnd[r->bnd_next].offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); + corrected_page_size = inuse + add_len; + WT_RET(bm->write_size(bm, session, &corrected_page_size)); - WT_RET(__wt_buf_grow(session, &r->disk_image, corrected_page_size)); + /* Need to account for buffer carrying two chunks worth of data */ + WT_RET(__wt_buf_grow(session, &r->disk_image, 2 * corrected_page_size)); + r->first_free = (uint8_t *)r->disk_image.mem + len; - WT_ASSERT(session, corrected_page_size >= len); - r->space_avail = corrected_page_size - len; + WT_ASSERT(session, corrected_page_size >= inuse); + r->space_avail = corrected_page_size - inuse; WT_ASSERT(session, r->space_avail >= add_len); + + return (0); +} + +/* + * __rec_split_write_prev_and_shift_cur -- + * Write the 
previous split chunk to the disk as a page. Shift the contents + * of the current chunk to the start of the buffer, making space for a new + * chunk to be written. + * If the caller asks for a chunk resizing, the boundary between the two + * chunks is readjusted to the minimum split size boundary details stored + * in the previous chunk, letting the current chunk grow at the cost of the + * previous chunk. + */ +static int +__rec_split_write_prev_and_shift_cur( + WT_SESSION_IMPL *session, WT_RECONCILE *r, bool resize_chunks) +{ + WT_BM *bm; + WT_BOUNDARY *bnd_cur, *bnd_prev; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk, *dsk_tmp; + size_t cur_len, len; + uint8_t *dsk_start; + + WT_ASSERT(session, r->bnd_next != 0); + + btree = S2BT(session); + bm = btree->bm; + bnd_cur = &r->bnd[r->bnd_next]; + bnd_prev = bnd_cur - 1; + dsk = r->disk_image.mem; + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + + /* + * Resize chunks if the current is smaller than the minimum, and there + * are details on the minimum split size boundary available in the + * previous boundary details. + * + * There is a possibility that we do not have a minimum boundary set, in + * such a case we skip chunk resizing. Such a condition is possible for + * instance when we are building the image in the buffer and the first + * K/V pair is large enough that it surpasses both the minimum split + * size and the split size the application has set. In such a case we + * split the chunk without saving any minimum boundary. 
+ */ + if (resize_chunks && + cur_len < r->min_split_size && bnd_prev->min_bnd_offset != 0) { + bnd_cur->offset = bnd_prev->min_bnd_offset; + bnd_cur->max_bnd_entries += + bnd_prev->max_bnd_entries - bnd_prev->min_bnd_entries; + bnd_prev->max_bnd_entries = bnd_prev->min_bnd_entries; + bnd_cur->max_bnd_recno = bnd_prev->min_bnd_recno; + + WT_RET(__wt_buf_set(session, &bnd_cur->max_bnd_key, + bnd_prev->min_bnd_key.data, bnd_prev->min_bnd_key.size)); + + /* Update current chunk's length */ + cur_len = WT_PTRDIFF(r->first_free, dsk) - bnd_cur->offset; + } + + /* + * Create an interim buffer if not already done to prepare the previous + * chunk's disk image. + */ + len = bnd_cur->offset; + WT_RET(bm->write_size(bm, session, &len)); + if (r->interim_buf == NULL) + WT_RET(__wt_scr_alloc(session, len, &r->interim_buf)); + else + WT_RET(__wt_buf_init(session, r->interim_buf, len)); + + dsk_tmp = r->interim_buf->mem; + memcpy(dsk_tmp, dsk, bnd_cur->offset); + dsk_tmp->recno = bnd_prev->max_bnd_recno; + dsk_tmp->u.entries = bnd_prev->max_bnd_entries; + dsk_tmp->mem_size = WT_STORE_SIZE(bnd_cur->offset); + r->interim_buf->size = dsk_tmp->mem_size; + WT_RET(__rec_split_write(session, r, bnd_prev, r->interim_buf, false)); + + /* Shift the current chunk to the start of the buffer */ + dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); + (void)memmove(dsk_start, (uint8_t *)dsk + bnd_cur->offset, cur_len); + + /* Fix boundary offset */ + bnd_cur->offset = WT_PAGE_HEADER_BYTE_SIZE(btree); + /* Fix where free points */ + r->first_free = dsk_start + cur_len; return (0); } @@ -2382,6 +2531,9 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) btree = S2BT(session); dsk = r->disk_image.mem; + /* Fixed length col store can call with next_len 0 */ + WT_ASSERT(session, next_len == 0 || r->space_avail < next_len); + /* * We should never split during salvage, and we're about to drop core * because there's no parent page. 
@@ -2391,147 +2543,63 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) "%s page too large, attempted split during salvage", __wt_page_type_string(r->page->type)); - /* Hitting a page boundary resets the dictionary, in all cases. */ - __rec_dictionary_reset(r); - - inuse = WT_PTRDIFF(r->first_free, dsk); - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; - - /* - * About to cross a split boundary but not yet forced to split - * into multiple pages. If we have to split, this is one of the - * split points, save information about where we are when the - * split would have happened. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; - - /* Set the number of entries for the just finished chunk. */ - last->entries = r->entries - r->total_entries; - r->total_entries = r->entries; - - /* Set the key for the next chunk. */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); - - /* - * Set the starting buffer offset and clear the entries (the - * latter not required, but cleaner). - */ - next->offset = WT_PTRDIFF(r->first_free, dsk); - next->entries = 0; - - /* Set the space available to another split-size chunk. */ - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - - /* - * Adjust the space available to handle two cases: - * - We don't have enough room for another full split-size - * chunk on the page. - * - We chose to fill past a page boundary because of a - * large item. - */ - if (inuse + r->space_avail > r->page_size) { - r->space_avail = - r->page_size > inuse ? 
(r->page_size - inuse) : 0; - - /* There are no further boundary points. */ - r->bnd_state = SPLIT_MAX; - } - - /* - * Return if the next object fits into this page, else we have - * to split the page. - */ - if (r->space_avail >= next_len) - return (0); - - /* FALLTHROUGH */ - case SPLIT_MAX: - /* - * We're going to have to split and create multiple pages. - * - * Cycle through the saved split-point information, writing the - * split chunks we have tracked. The underlying fixup function - * sets the space available and other information, and copied - * any unwritten chunk of data to the beginning of the buffer. - */ - WT_RET(__rec_split_fixup(session, r)); - - /* We're done saving split chunks. */ - r->bnd_state = SPLIT_TRACKING_OFF; - break; - case SPLIT_TRACKING_OFF: - /* - * We can get here if the first key/value pair won't fit. - * Additionally, grow the buffer to contain the current item if - * we haven't already consumed a reasonable portion of a split - * chunk. - */ - if (inuse < r->split_size / 2) - break; + last = &r->bnd[r->bnd_next]; + inuse = (WT_PTRDIFF(r->first_free, dsk) - last->offset) + + WT_PAGE_HEADER_BYTE_SIZE(btree); - /* - * The key/value pairs didn't fit into a single page, but either - * we've already noticed that and are now processing the rest of - * the pairs at split size boundaries, or the split size was the - * same as the page size, and we never bothered with split point - * information at all. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - last = &r->bnd[r->bnd_next++]; - next = last + 1; + /* + * We can get here if the first key/value pair won't fit. + * Additionally, grow the buffer to contain the current item if we + * haven't already consumed a reasonable portion of a split chunk. + */ + if (inuse < r->split_size / 2) + goto done; - /* - * Set the key for the next chunk (before writing the block, a - * key range is needed in that code). 
- */ - next->recno = r->recno; - if (dsk->type == WT_PAGE_ROW_INT || - dsk->type == WT_PAGE_ROW_LEAF) - WT_RET(__rec_split_row_promote( - session, r, &next->key, dsk->type)); + /* All page boundaries reset the dictionary. */ + __rec_dictionary_reset(r); - /* Clear the entries (not required, but cleaner). */ - next->entries = 0; + /* Set the number of entries for the just finished chunk. */ + last->max_bnd_entries = r->entries; - /* Finalize the header information and write the page. */ - dsk->recno = last->recno; - dsk->u.entries = r->entries; - dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); + /* + * In case of bulk load, write out chunks as we get them. Otherwise we + * keep two chunks in memory at a given time. So, if there is a previous + * chunk, write it out, making space in the buffer for the next chunk to + * be written. + */ + if (r->is_bulk_load) { + dsk->recno = last->max_bnd_recno; + dsk->u.entries = last->max_bnd_entries; + dsk->mem_size = (uint32_t)inuse; r->disk_image.size = dsk->mem_size; - WT_RET( - __rec_split_write(session, r, last, &r->disk_image, false)); - - /* - * Set the caller's entry count and buffer information for the - * next chunk. We only get here if we're not splitting or have - * already split, so it's split-size chunks from here on out. - */ - r->entries = 0; + WT_RET(__rec_split_write( + session, r, last, &r->disk_image, false)); + /* Fix where free points */ r->first_free = WT_PAGE_HEADER_BYTE(btree, dsk); - r->space_avail = - r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); - break; - case SPLIT_TRACKING_RAW: - return (__wt_illegal_value(session, NULL)); - } + } else if (r->bnd_next != 0) + WT_RET(__rec_split_write_prev_and_shift_cur(session, r, false)); - /* + /* Prepare the next boundary */ + WT_RET(__rec_split_bnd_grow(session, r)); + r->bnd_next++; + next = &r->bnd[r->bnd_next]; + next->offset = WT_PTRDIFF(r->first_free, dsk); + /* Set the key for the next chunk. 
*/ + next->max_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &next->max_bnd_key, dsk->type)); + + r->entries = 0; + /* + * Set the space available to another split-size and minimum split-size + * chunk. + */ + r->space_avail = r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + r->min_space_avail = + r->min_split_size - WT_PAGE_HEADER_BYTE_SIZE(btree); + +done: /* * Overflow values can be larger than the maximum page size but still be * "on-page". If the next key/value pair is larger than space available * after a split has happened (in other words, larger than the maximum @@ -2549,6 +2617,64 @@ __rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) } /* + * __rec_split_crossing_bnd -- + * Save the details for the minimum split size boundary or call for a + * split. + */ +static inline int +__rec_split_crossing_bnd( + WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) +{ + WT_BOUNDARY *bnd; + WT_BTREE *btree; + WT_PAGE_HEADER *dsk; + size_t min_bnd_offset; + + WT_ASSERT(session, WT_CHECK_CROSSING_BND(r, next_len)); + + /* + * If crossing the minimum split size boundary, store the boundary + * details at the current location in the buffer. If we are crossing the + * split boundary at the same time, possible when the next record is + * large enough, just split at this point. + */ + if (WT_CROSSING_MIN_BND(r, next_len) && + !WT_CROSSING_SPLIT_BND(r, next_len)) { + btree = S2BT(session); + bnd = &r->bnd[r->bnd_next]; + dsk = r->disk_image.mem; + min_bnd_offset = (WT_PTRDIFF(r->first_free, dsk) - + bnd->offset) + WT_PAGE_HEADER_BYTE_SIZE(btree); + if (min_bnd_offset == WT_PAGE_HEADER_BYTE_SIZE(btree)) + /* + * This is possible if the first record doesn't fit in + * the minimum split size, we write this record without + * setting up any boundary here. We will get the + * opportunity to setup a boundary before writing out + * the next record. 
+ */ + return (0); + + WT_ASSERT(session, bnd->min_bnd_offset == 0); + + /* All page boundaries reset the dictionary. */ + __rec_dictionary_reset(r); + + bnd->min_bnd_offset = min_bnd_offset; + bnd->min_bnd_entries = r->entries; + bnd->min_bnd_recno = r->recno; + if (dsk->type == WT_PAGE_ROW_INT || + dsk->type == WT_PAGE_ROW_LEAF) + WT_RET(__rec_split_row_promote( + session, r, &bnd->min_bnd_key, dsk->type)); + return (0); + } + + /* We are crossing a split boundary */ + return (__rec_split(session, r, next_len)); +} + +/* * __rec_split_raw_worker -- * Handle the raw compression page reconciliation bookkeeping. */ @@ -2626,7 +2752,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, */ recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) - recno = last->recno; + recno = last->max_bnd_recno; entry = max_image_slot = slots = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { @@ -2853,7 +2979,7 @@ no_slots: */ dst->size = result_len + WT_BLOCK_COMPRESS_SKIP; dsk_dst = dst->mem; - dsk_dst->recno = last->recno; + dsk_dst->recno = last->max_bnd_recno; dsk_dst->mem_size = r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP; dsk_dst->u.entries = r->raw_entries[result_slots - 1]; @@ -2873,7 +2999,7 @@ no_slots: WT_RET(__wt_strndup(session, dsk, dsk_dst->mem_size, &last->disk_image)); disk_image = last->disk_image; - disk_image->recno = last->recno; + disk_image->recno = last->max_bnd_recno; disk_image->mem_size = dsk_dst->mem_size; disk_image->u.entries = dsk_dst->u.entries; } @@ -2889,7 +3015,7 @@ no_slots: len = WT_PTRDIFF( r->first_free, (uint8_t *)dsk + dsk_dst->mem_size); dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len); + (void)memmove(dsk_start, r->first_free - len, len); r->entries -= r->raw_entries[result_slots - 1]; r->first_free = dsk_start + len; @@ -2903,14 +3029,14 @@ no_slots: */ switch (dsk->type) { case WT_PAGE_COL_INT: - next->recno = r->raw_recnos[result_slots]; + next->max_bnd_recno = 
r->raw_recnos[result_slots]; break; case WT_PAGE_COL_VAR: - next->recno = r->raw_recnos[result_slots - 1]; + next->max_bnd_recno = r->raw_recnos[result_slots - 1]; break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = WT_RECNO_OOB; + next->max_bnd_recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2919,7 +3045,7 @@ no_slots: */ WT_ASSERT(session, len > 0); WT_RET(__rec_split_row_promote_cell( - session, dsk, &next->key)); + session, dsk, &next->max_bnd_key)); } break; } @@ -2931,7 +3057,7 @@ no_slots: */ WT_STAT_DATA_INCR(session, compress_raw_fail); - dsk->recno = last->recno; + dsk->recno = last->max_bnd_recno; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); dsk->u.entries = r->entries; r->disk_image.size = dsk->mem_size; @@ -3008,35 +3134,9 @@ __rec_split_raw(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) static int __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) { - WT_BOUNDARY *bnd; + WT_BOUNDARY *bnd_cur, *bnd_prev; WT_PAGE_HEADER *dsk; - - /* Adjust the boundary information based on our split status. */ - switch (r->bnd_state) { - case SPLIT_BOUNDARY: - case SPLIT_MAX: - /* - * We never split, the reconciled page fit into a maximum page - * size. Change the first boundary slot to represent the full - * page (the first boundary slot is largely correct, just update - * the number of entries). - */ - r->bnd_next = 0; - break; - case SPLIT_TRACKING_OFF: - /* - * If we have already split, or aren't tracking boundaries, put - * the remaining data in the next boundary slot. - */ - WT_RET(__rec_split_bnd_grow(session, r)); - break; - case SPLIT_TRACKING_RAW: - /* - * We were configured for raw compression, and either we never - * wrote anything, or there's a remaindered block of data. 
- */ - break; - } + bool grow_bnd; /* * We may arrive here with no entries to write if the page was entirely @@ -3063,20 +3163,66 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) return (EBUSY); } - /* Set the boundary reference and increment the count. */ - bnd = &r->bnd[r->bnd_next++]; - bnd->entries = r->entries; - - /* Finalize the header information. */ dsk = r->disk_image.mem; - dsk->recno = bnd->recno; - dsk->u.entries = r->entries; + + /* Set the number of entries for the just finished chunk. */ + bnd_cur = &r->bnd[r->bnd_next]; + bnd_cur->max_bnd_entries = r->entries; + + grow_bnd = true; + /* + * We can reach here even with raw_compression when the last split chunk + * is too small to be sent for raw compression. + */ + if (!r->is_bulk_load && !r->raw_compression) { + if (WT_PTRDIFF(r->first_free, dsk) > r->page_size && + r->bnd_next != 0) { + /* + * We hold two boundaries worth of data in the buffer, + * and this data doesn't fit in a single page. If the + * last chunk is too small, readjust the boundary to a + * pre-computed minimum. + * Write out the penultimate chunk to the disk as a page + */ + WT_RET(__rec_split_write_prev_and_shift_cur( + session, r, true)); + } else + if (r->bnd_next != 0) { + /* + * We have two boundaries, but the data in the + * buffer can fit a single page. Merge the + * boundaries to create a single chunk. + */ + bnd_prev = bnd_cur - 1; + bnd_prev->max_bnd_entries += + bnd_cur->max_bnd_entries; + r->bnd_next--; + grow_bnd = false; + } + } + + /* + * We already have space for an extra boundary if we merged two + * boundaries above, in that case we do not need to grow the boundary + * structure. + */ + if (grow_bnd) + WT_RET(__rec_split_bnd_grow(session, r)); + bnd_cur = &r->bnd[r->bnd_next]; + r->bnd_next++; + + /* + * Current boundary now has all the remaining data/last page now. 
+ * Let's write it to the disk + */ + dsk->recno = bnd_cur->max_bnd_recno; + dsk->u.entries = bnd_cur->max_bnd_entries; dsk->mem_size = WT_PTRDIFF32(r->first_free, dsk); r->disk_image.size = dsk->mem_size; /* If this is a checkpoint, we're done, otherwise write the page. */ - return (__rec_is_checkpoint(session, r, bnd) ? - 0 : __rec_split_write(session, r, bnd, &r->disk_image, true)); + return (__rec_is_checkpoint(session, r, bnd_cur) ? + 0 : __rec_split_write(session, r, bnd_cur, &r->disk_image, true)); } /* @@ -3110,98 +3256,6 @@ __rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * __rec_split_fixup -- - * Fix up after crossing the maximum page boundary. - */ -static int -__rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) -{ - WT_BOUNDARY *bnd; - WT_BTREE *btree; - WT_DECL_ITEM(tmp); - WT_DECL_RET; - WT_PAGE_HEADER *dsk; - size_t i, len; - uint8_t *dsk_start, *p; - - /* - * When we overflow physical limits of the page, we walk the list of - * split chunks we've created and write those pages out, then update - * the caller's information. - */ - btree = S2BT(session); - - /* - * The data isn't laid out on a page boundary or nul padded; copy it to - * a clean, aligned, padded buffer before writing it. - * - * Allocate a scratch buffer to hold the new disk image. Copy the disk - * page's header and block-manager space into the scratch buffer, most - * of the header information remains unchanged between the pages. - */ - WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); - dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); - - /* - * For each split chunk we've created, update the disk image and copy - * it into place. - */ - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - for (i = 0, bnd = r->bnd; i < r->bnd_next; ++i, ++bnd) { - /* Copy the page contents to the temporary buffer. 
*/ - len = (bnd + 1)->offset - bnd->offset; - memcpy(dsk_start, - (uint8_t *)r->disk_image.mem + bnd->offset, len); - - /* Finalize the header information and write the page. */ - dsk->recno = bnd->recno; - dsk->u.entries = bnd->entries; - tmp->size = WT_PAGE_HEADER_BYTE_SIZE(btree) + len; - dsk->mem_size = WT_STORE_SIZE(tmp->size); - WT_ERR(__rec_split_write(session, r, bnd, tmp, false)); - } - - /* - * There is probably a remnant in the working buffer that didn't get - * written, copy it down to the beginning of the working buffer. - * - * Confirm the remnant is no larger than a split-sized chunk, including - * header. We know that's the maximum sized remnant because we only have - * remnants if split switches from accumulating to a split boundary to - * accumulating to the end of the page (the other path here is when we - * hit a split boundary, there was room for another split chunk in the - * page, and the next item still wouldn't fit, in which case there is no - * remnant). So: we were accumulating to the end of the page and created - * a remnant. We know the remnant cannot be as large as a split-sized - * chunk, including header, because if there was room for that large a - * remnant, we wouldn't have switched from accumulating to a page end. - */ - p = (uint8_t *)r->disk_image.mem + bnd->offset; - len = WT_PTRDIFF(r->first_free, p); - if (len >= r->split_size - WT_PAGE_HEADER_BYTE_SIZE(btree)) - WT_PANIC_ERR(session, EINVAL, - "Reconciliation remnant too large for the split buffer"); - dsk = r->disk_image.mem; - dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk); - (void)memmove(dsk_start, p, len); - - /* - * Fix up our caller's information, including updating the starting - * record number. 
- */ - r->entries -= r->total_entries; - r->first_free = dsk_start + len; - WT_ASSERT(session, - r->page_size >= (WT_PAGE_HEADER_BYTE_SIZE(btree) + len)); - r->space_avail = - r->split_size - (WT_PAGE_HEADER_BYTE_SIZE(btree) + len); - -err: __wt_scr_free(session, &tmp); - return (ret); -} - -/* * __rec_split_write -- * Write a disk block out for the split helper functions. */ @@ -3222,11 +3276,17 @@ __rec_split_write(WT_SESSION_IMPL *session, int cmp; uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE]; bool need_image; +#ifdef HAVE_DIAGNOSTIC + bool verify_image; +#endif btree = S2BT(session); dsk = buf->mem; page = r->page; mod = page->modify; +#ifdef HAVE_DIAGNOSTIC + verify_image = true; +#endif /* Set the zero-length value flag in the page header. */ if (dsk->type == WT_PAGE_ROW_LEAF) { @@ -3238,8 +3298,6 @@ __rec_split_write(WT_SESSION_IMPL *session, F_SET(dsk, WT_PAGE_EMPTY_V_NONE); } - bnd->entries = r->entries; - /* Initialize the address (set the page type for the parent). */ switch (dsk->type) { case WT_PAGE_COL_FIX: @@ -3285,7 +3343,8 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + if (WT_INSERT_RECNO(supd->ins) >= + (bnd + 1)->max_bnd_recno) goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: @@ -3296,8 +3355,8 @@ __rec_split_write(WT_SESSION_IMPL *session, key->data = WT_INSERT_KEY(supd->ins); key->size = WT_INSERT_KEY_SIZE(supd->ins); } - WT_ERR(__wt_compare(session, - btree->collator, key, &(bnd + 1)->key, &cmp)); + WT_ERR(__wt_compare(session, btree->collator, + key, &(bnd + 1)->max_bnd_key, &cmp)); if (cmp >= 0) goto supd_check_complete; break; @@ -3387,18 +3446,21 @@ supd_check_complete: #ifdef HAVE_VERBOSE /* Output a verbose message if we create a page without many entries */ - if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && r->entries < 6) + if (WT_VERBOSE_ISSET(session, WT_VERB_SPLIT) && + bnd->max_bnd_entries < 6) 
__wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d", - r->entries, r->page->memory_footprint, r->bnd_next, - F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint", - r->bnd_state); + ", page count %" PRIu32 ", %s", bnd->max_bnd_entries, + r->page->memory_footprint, r->bnd_next, + F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint"); #endif WT_ERR(__wt_bt_write(session, buf, addr, &addr_size, false, F_ISSET(r, WT_CHECKPOINTING), bnd->already_compressed)); +#ifdef HAVE_DIAGNOSTIC + verify_image = false; +#endif WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; @@ -3425,9 +3487,20 @@ copy_image: */ need_image = F_ISSET(r, WT_EVICT_SCRUB) || (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL); - if (need_image && bnd->disk_image == NULL) + if (need_image && bnd->disk_image == NULL) { +#ifdef HAVE_DIAGNOSTIC + /* + * The I/O routines verify all disk images we write, but there + * are paths in reconciliation that don't do I/O. Verify those + * images, too. + */ + WT_ASSERT(session, verify_image == false || + __wt_verify_dsk_image( + session, "[reconcile-image]", buf->data, 0, true) == 0); +#endif WT_ERR(__wt_strndup( session, buf->data, buf->size, &bnd->disk_image)); + } if (!need_image) __wt_free(session, bnd->disk_image); @@ -3583,11 +3656,12 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) uint64_t recno; btree = S2BT(session); + /* * Bulk-load is only permitted on newly created files, not any empty * file -- see the checkpoint code for a discussion. 
*/ - if (!btree->bulk_load_ok) + if (!btree->original) WT_RET_MSG(session, EINVAL, "bulk-load is only possible for newly created trees"); @@ -3604,16 +3678,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) r = cbulk->reconcile; r->is_bulk_load = true; - recno = WT_RECNO_OOB; /* -Werror=maybe-uninitialized */ - switch (btree->type) { - case BTREE_COL_FIX: - case BTREE_COL_VAR: - recno = 1; - break; - case BTREE_ROW: - recno = WT_RECNO_OOB; - break; - } + recno = btree->type == BTREE_ROW ? WT_RECNO_OOB : 1; return (__rec_split_init( session, r, cbulk->leaf, recno, btree->maxleafpage)); @@ -3688,11 +3753,12 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) cursor->value.data, cursor->value.size, (uint64_t)0)); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) - WT_RET( - __rec_split_raw(session, r, key->len + val->len)); - else { + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key written * to the new page, and (unless already working with an @@ -3704,10 +3770,9 @@ __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_cell_build_leaf_key( session, r, NULL, 0, &ovfl_key)); } - - WT_RET(__rec_split(session, r, key->len + val->len)); + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -3748,6 +3813,10 @@ __rec_col_fix_bulk_insert_split_check(WT_CURSOR_BULK *cbulk) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. 
*/ __rec_incr(session, r, cbulk->entry, __bitstr_size( @@ -3852,10 +3921,12 @@ __wt_bulk_insert_var( r, cbulk->last.data, cbulk->last.size, cbulk->rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CROSSING_SPLIT_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd(session, r, val->len)); /* Copy the value onto the page. */ if (btree->dictionary) @@ -3991,10 +4062,13 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) WT_CHILD_RELEASE_ERR(session, hazard, ref); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_ERR(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_ERR(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_ERR(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ __rec_copy_incr(session, r, val); @@ -4036,10 +4110,13 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. 
*/ __rec_copy_incr(session, r, val); @@ -4069,7 +4146,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Copy the original, disk-image bytes into place. */ memcpy(r->first_free, page->pg_fix_bitf, - __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt)); + __bitstr_size((size_t)page->entries * btree->bitcnt)); /* Update any changes to the original on-page data items. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { @@ -4081,9 +4158,8 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) } /* Calculate the number of entries per page remainder. */ - entry = page->pg_fix_entries; - nrecs = WT_FIX_BYTES_TO_ENTRIES( - btree, r->space_avail) - page->pg_fix_entries; + entry = page->entries; + nrecs = WT_FIX_BYTES_TO_ENTRIES(btree, r->space_avail) - page->entries; r->recno += entry; /* Walk any append list. */ @@ -4148,6 +4224,10 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) * split. * * Boundary: split or write the page. + * + * No need to have a minimum split size boundary, all + * pages are filled 100% except the last, allowing it to + * grow in the future. */ __rec_incr(session, r, entry, __bitstr_size((size_t)entry * btree->bitcnt)); @@ -4206,7 +4286,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session, session, r, page, pageref->ref_recno, btree->maxleafpage)); /* We may not be taking all of the entries on the original page. */ - page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take; + page_take = salvage->take == 0 ? page->entries : salvage->take; page_start = salvage->skip == 0 ? 0 : salvage->skip; /* Calculate the number of entries per page. */ @@ -4304,10 +4384,13 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r, session, r, value->data, value->size, rle)); /* Boundary: split or write the page. */ - if (val->len > r->space_avail) - WT_RET(r->raw_compression ? 
- __rec_split_raw(session, r, val->len) : - __rec_split(session, r, val->len)); + if (r->raw_compression) { + if (val->len > r->space_avail) + WT_RET(__rec_split_raw(session, r, val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, val->len)); /* Copy the value onto the page. */ if (!deleted && !overflow_type && btree->dictionary) @@ -4970,11 +5053,12 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * In one path above, we copied address blocks * from the page rather than building the actual @@ -4986,10 +5070,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_IKEY_DATA(ikey), ikey->size)); key_onpage_ovfl = false; } - WT_ERR(__rec_split( + + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key and value onto the page. */ __rec_copy_incr(session, r, key); @@ -5039,10 +5123,14 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) - WT_RET(r->raw_compression ? - __rec_split_raw(session, r, key->len + val->len) : - __rec_split(session, r, key->len + val->len)); + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) + WT_RET(__rec_split_raw( + session, r, key->len + val->len)); + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) + WT_RET(__rec_split_crossing_bnd( + session, r, key->len + val->len)); /* Copy the key and value onto the page. 
*/ __rec_copy_incr(session, r, key); @@ -5371,16 +5459,17 @@ build: } /* Boundary: split or write the page. */ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_ERR(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* - * In one path above, we copied address blocks - * from the page rather than building the actual - * key. In that case, we have to build the key - * now because we are about to promote it. + * If we copied address blocks from the page + * rather than building the actual key, we have + * to build the key now because we are about to + * promote it. */ if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, @@ -5399,14 +5488,13 @@ build: if (!ovfl_key) WT_ERR( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_ERR(__rec_split( + WT_ERR(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5469,11 +5557,12 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); /* Boundary: split or write the page. 
*/ - if (key->len + val->len > r->space_avail) { - if (r->raw_compression) + if (r->raw_compression) { + if (key->len + val->len > r->space_avail) WT_RET(__rec_split_raw( session, r, key->len + val->len)); - else { + } else + if (WT_CHECK_CROSSING_BND(r, key->len + val->len)) { /* * Turn off prefix compression until a full key * written to the new page, and (unless already @@ -5485,14 +5574,13 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) if (!ovfl_key) WT_RET( __rec_cell_build_leaf_key( - session, - r, NULL, 0, &ovfl_key)); + session, r, NULL, 0, + &ovfl_key)); } - WT_RET(__rec_split( + WT_RET(__rec_split_crossing_bnd( session, r, key->len + val->len)); } - } /* Copy the key/value pair onto the page. */ __rec_copy_incr(session, r, key); @@ -5604,13 +5692,14 @@ __rec_split_dump_keys(WT_SESSION_IMPL *session, WT_PAGE *page, WT_RECONCILE *r) __wt_verbose(session, WT_VERB_SPLIT, "starting key %s", __wt_buf_set_printable( - session, bnd->key.data, bnd->key.size, tkey)); + session, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, tkey)); break; case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: __wt_verbose(session, WT_VERB_SPLIT, - "starting recno %" PRIu64, bnd->recno); + "starting recno %" PRIu64, bnd->max_bnd_recno); break; WT_ILLEGAL_VALUE_ERR(session); } @@ -5872,10 +5961,10 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* We never set the first page's key, grab it from the original page. */ ref = r->ref; if (__wt_ref_is_root(ref)) - WT_RET(__wt_buf_set(session, &r->bnd[0].key, "", 1)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, "", 1)); else { __wt_ref_key(ref->home, ref, &p, &size); - WT_RET(__wt_buf_set(session, &r->bnd[0].key, p, size)); + WT_RET(__wt_buf_set(session, &r->bnd[0].max_bnd_key, p, size)); } /* Allocate, then initialize the array of replacement blocks. 
*/ @@ -5883,8 +5972,8 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - WT_RET(__wt_row_ikey_alloc(session, 0, - bnd->key.data, bnd->key.size, &multi->key.ikey)); + WT_RET(__wt_row_ikey_alloc(session, 0, bnd->max_bnd_key.data, + bnd->max_bnd_key.size, &multi->key.ikey)); /* * Copy any disk image. Don't take saved updates without a @@ -5931,7 +6020,7 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) for (multi = mod->mod_multi, bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { - multi->key.recno = bnd->recno; + multi->key.recno = bnd->max_bnd_recno; /* * Copy any disk image. Don't take saved updates without a @@ -6408,7 +6497,8 @@ __rec_dictionary_lookup( for (dp = __rec_dictionary_skip_search(r->dictionary_head, hash); dp != NULL && dp->hash == hash; dp = dp->next[0]) { WT_RET(__wt_cell_pack_data_match( - dp->cell, &val->cell, val->buf.data, &match)); + (WT_CELL *)((uint8_t *)r->disk_image.mem + dp->offset), + &val->cell, val->buf.data, &match)); if (match) { WT_STAT_DATA_INCR(session, rec_dictionary); *dpp = dp; @@ -6434,7 +6524,7 @@ __rec_dictionary_lookup( * know where on the page it will be written). */ next = r->dictionary[r->dictionary_next++]; - next->cell = NULL; /* Not necessary, just cautious. */ + next->offset = 0; /* Not necessary, just cautious. */ next->hash = hash; __rec_dictionary_skip_insert(r->dictionary_head, next, hash); *dpp = next; diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 020d5e72c13..0677fa711a5 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -35,7 +35,7 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session, * units of its happy place. 
*/ if (FLD_ISSET(conn->direct_io, - WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA)) { + WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA)) { align = (int64_t)conn->buffer_alignment; if (align != 0 && (cval.val < align || cval.val % align != 0)) WT_RET_MSG(session, EINVAL, @@ -601,7 +601,8 @@ __create_table(WT_SESSION_IMPL *session, if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); - snprintf(cgname, cgsize, "colgroup:%s", tablename); + WT_ERR(__wt_snprintf( + cgname, cgsize, "colgroup:%s", tablename)); WT_ERR(__create_colgroup( session, cgname, exclusive, config)); } diff --git a/src/schema/schema_drop.c b/src/schema/schema_drop.c index c1a4f257648..49801e4e5f9 100644 --- a/src/schema/schema_drop.c +++ b/src/schema/schema_drop.c @@ -30,7 +30,7 @@ __drop_file( WT_RET(__wt_schema_backup_check(session, filename)); /* Close all btree handles associated with this file. */ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, force)); WT_RET(ret); diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c index ea7374b7554..74ef5135a4a 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -25,7 +25,7 @@ __schema_add_table(WT_SESSION_IMPL *session, /* Make sure the metadata is open before getting other locks. */ WT_RET(__wt_metadata_cursor(session, NULL)); - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_READ_LOCK(session, ret = __wt_schema_open_table( session, name, namelen, ok_incomplete, &table)); WT_RET(ret); diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c index f512482c162..a374f4c2831 100644 --- a/src/schema/schema_rename.c +++ b/src/schema/schema_rename.c @@ -33,7 +33,7 @@ __rename_file( WT_RET(__wt_schema_backup_check(session, filename)); WT_RET(__wt_schema_backup_check(session, newfile)); /* Close any btree handles in the file. 
*/ - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all(session, uri, false)); WT_ERR(ret); diff --git a/src/schema/schema_util.c b/src/schema/schema_util.c index 433224a868e..9de4b916a79 100644 --- a/src/schema/schema_util.c +++ b/src/schema/schema_util.c @@ -26,7 +26,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) conn = S2C(session); if (!conn->hot_backup) return (0); - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); /* * There is a window at the end of a backup where the list has been * cleared from the connection but the flag is still set. It is safe @@ -34,7 +34,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) */ if (!conn->hot_backup || (backup_list = conn->hot_backup_list) == NULL) { - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (0); } for (i = 0; backup_list[i] != NULL; ++i) { @@ -43,7 +43,7 @@ __wt_schema_backup_check(WT_SESSION_IMPL *session, const char *name) break; } } - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); return (ret); } diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index fb7f8cec074..62cdd7d367b 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -49,7 +49,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, * any open file handles, including checkpoints. 
*/ if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE)) { - WT_WITH_HANDLE_LIST_LOCK(session, + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = __wt_conn_dhandle_close_all( session, uri, false)); WT_ERR(ret); @@ -112,10 +112,10 @@ __wt_schema_worker(WT_SESSION_IMPL *session, wt_session = (WT_SESSION *)session; if (file_func == __wt_salvage && dsrc->salvage != NULL) WT_ERR(dsrc->salvage( - dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_verify && dsrc->verify != NULL) WT_ERR(dsrc->verify( - dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); + dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_checkpoint) ; else if (file_func == __wt_checkpoint_get_handles) diff --git a/src/session/session_api.c b/src/session/session_api.c index fe1bf821d3b..b7daf0e2e02 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -128,7 +128,7 @@ __session_clear(WT_SESSION_IMPL *session) * * For these reasons, be careful when clearing the session structure. */ - memset(session, 0, WT_SESSION_CLEAR_SIZE(session)); + memset(session, 0, WT_SESSION_CLEAR_SIZE); WT_INIT_LSN(&session->bg_sync_lsn); @@ -162,7 +162,7 @@ __session_alter(WT_SESSION *wt_session, const char *uri, const char *config) cfg[1] = NULL; WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_alter(session, uri, cfg)))); err: if (ret != 0) @@ -234,9 +234,6 @@ __session_close(WT_SESSION *wt_session, const char *config) /* Release common session resources. */ WT_TRET(__wt_session_release_resources(session)); - /* Destroy the thread's mutex. */ - WT_TRET(__wt_cond_destroy(session, &session->cond)); - /* The API lock protects opening and closing of sessions. 
*/ __wt_spin_lock(session, &conn->api_lock); @@ -521,7 +518,7 @@ __wt_session_create( WT_DECL_RET; WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_create(session, uri, config))); return (ret); } @@ -769,7 +766,7 @@ __session_rename(WT_SESSION *wt_session, WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: if (ret != 0) @@ -858,21 +855,22 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) if (lock_wait) WT_WITH_CHECKPOINT_LOCK(session, WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, ret = + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg)))); else WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, ret = + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); } else { if (lock_wait) WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, + WT_WITH_TABLE_WRITE_LOCK(session, ret = __wt_schema_drop(session, uri, cfg))); else WT_WITH_SCHEMA_LOCK_NOWAIT(session, ret, - WT_WITH_TABLE_LOCK_NOWAIT(session, ret, + WT_WITH_TABLE_WRITE_LOCK_NOWAIT(session, ret, ret = __wt_schema_drop(session, uri, cfg))); } @@ -1208,10 +1206,15 @@ __wt_session_range_truncate(WT_SESSION_IMPL *session, done: err: /* - * Close any locally-opened start cursor. + * Close any locally-opened start cursor. Reset application cursors, + * they've possibly moved and the application cannot use them. */ if (local_start) WT_TRET(start->close(start)); + else + WT_TRET(start->reset(start)); + if (stop != NULL) + WT_TRET(stop->reset(stop)); return (ret); } @@ -1489,6 +1492,20 @@ err: API_END_RET(session, ret); } /* + * __transaction_sync_run_chk -- + * Check to decide if the transaction sync call should continue running. 
+ */ +static bool +__transaction_sync_run_chk(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + return (FLD_ISSET(conn->flags, WT_CONN_SERVER_LOG)); +} + +/* * __session_transaction_sync -- * WT_SESSION->transaction_sync method. */ @@ -1502,7 +1519,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) WT_SESSION_IMPL *session; WT_TXN *txn; struct timespec now, start; - uint64_t timeout_ms, waited_ms; + uint64_t remaining_usec, timeout_ms, waited_ms; bool forever; session = (WT_SESSION_IMPL *)wt_session; @@ -1555,22 +1572,20 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach - * our timeout. + * our timeout, or there's some other reason to quit. */ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + if (!__transaction_sync_run_chk(session)) + WT_ERR(ETIMEDOUT); + __wt_cond_signal(session, conn->log_file_cond); __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); - if (forever || waited_ms < timeout_ms) - /* - * Note, we will wait an increasing amount of time - * each iteration, likely doubling. Also note that - * the function timeout value is in usecs (we are - * computing the wait time in msecs and passing that - * in, unchanged, as the usecs to wait). - */ - __wt_cond_wait(session, log->log_sync_cond, waited_ms); - else + if (forever || waited_ms < timeout_ms) { + remaining_usec = (timeout_ms - waited_ms) * WT_THOUSAND; + __wt_cond_wait(session, log->log_sync_cond, + remaining_usec, __transaction_sync_run_chk); + } else WT_ERR(ETIMEDOUT); } @@ -1686,7 +1701,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) WT_ERR(__wt_txn_named_snapshot_config( session, cfg, &has_create, &has_drop)); - __wt_writelock(session, txn_global->nsnap_rwlock); + __wt_writelock(session, &txn_global->nsnap_rwlock); /* Drop any snapshots to be removed first. 
*/ if (has_drop) @@ -1696,7 +1711,7 @@ __session_snapshot(WT_SESSION *wt_session, const char *config) if (has_create) WT_ERR(__wt_txn_named_snapshot_begin(session, cfg)); -err: __wt_writeunlock(session, txn_global->nsnap_rwlock); +err: __wt_writeunlock(session, &txn_global->nsnap_rwlock); API_END_RET_NOTFOUND_MAP(session, ret); } @@ -1797,7 +1812,7 @@ __open_session(WT_CONNECTION_IMPL *conn, * closes the connection. This is particularly intended to catch * cases where server threads open sessions. */ - WT_ASSERT(session, F_ISSET(conn, WT_CONN_SERVER_RUN)); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_CLOSING)); /* Find the first inactive session slot. */ for (session_ret = conn->sessions, @@ -1825,8 +1840,6 @@ __open_session(WT_CONNECTION_IMPL *conn, session_ret->name = NULL; session_ret->id = i; - WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); - if (WT_SESSION_FIRST_USE(session_ret)) __wt_random_init(&session_ret->rnd); diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 85214ae6d98..72c072e0fb8 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -210,7 +210,7 @@ __compact_checkpoint(WT_SESSION_IMPL *session) * work we need to have done is done in the underlying block manager. */ const char *checkpoint_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL }; + WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL }; /* Checkpoints take a lot of time, check if we've run out. 
*/ WT_RET(__wt_session_compact_check_timeout(session)); diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 732dc797b6d..95fb6a6f90e 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -44,8 +44,7 @@ __session_discard_dhandle( TAILQ_REMOVE(&session->dhandles, dhandle_cache, q); TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq); - (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1); - + WT_DHANDLE_RELEASE(dhandle_cache->dhandle); __wt_overwrite_and_free(session, dhandle_cache); } @@ -181,17 +180,17 @@ __wt_session_lock_dhandle( */ if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) { - __wt_readlock(session, dhandle->rwlock); + __wt_readlock(session, &dhandle->rwlock); if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); return (0); } is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN); if (is_open && !want_exclusive) return (0); - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } else is_open = false; @@ -201,10 +200,11 @@ __wt_session_lock_dhandle( * with another thread that successfully opens the file, we * don't want to block waiting to get exclusive access. 
*/ - if ((ret = __wt_try_writelock(session, dhandle->rwlock)) == 0) { + if ((ret = + __wt_try_writelock(session, &dhandle->rwlock)) == 0) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { *is_deadp = 1; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); return (0); } @@ -215,7 +215,7 @@ __wt_session_lock_dhandle( if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !want_exclusive) { lock_busy = false; - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); continue; } @@ -270,6 +270,16 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) { ret = __wt_conn_btree_sync_and_close(session, false, true); F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE); + } else if (F_ISSET(btree, WT_BTREE_BULK)) { + WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && + !F_ISSET(dhandle, WT_DHANDLE_DISCARD)); + /* + * Acquire the schema lock while completing a bulk load. This + * avoids racing with a checkpoint while it gathers a set + * of handles. + */ + WT_WITH_SCHEMA_LOCK(session, ret = + __wt_conn_btree_sync_and_close(session, false, false)); } else if (F_ISSET(dhandle, WT_DHANDLE_DISCARD) || F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) { WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE)); @@ -286,9 +296,9 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) if (locked) { if (write_locked) { F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); } else - __wt_readunlock(session, dhandle->rwlock); + __wt_readunlock(session, &dhandle->rwlock); } session->dhandle = NULL; @@ -411,17 +421,27 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) /* * __session_find_shared_dhandle -- * Search for a data handle in the connection and add it to a session's - * cache. 
Since the data handle isn't locked, this must be called holding - * the handle list lock, and we must increment the handle's reference - * count before releasing it. + * cache. We must increment the handle's reference count while holding + * the handle list lock. */ static int __session_find_shared_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); - (void)__wt_atomic_add32(&session->dhandle->session_ref, 1); - return (0); + WT_DECL_RET; + + WT_WITH_HANDLE_LIST_READ_LOCK(session, + if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + if (ret != WT_NOTFOUND) + return (ret); + + WT_WITH_HANDLE_LIST_WRITE_LOCK(session, + if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0) + WT_DHANDLE_ACQUIRE(session->dhandle)); + + return (ret); } /* @@ -449,16 +469,16 @@ __session_get_dhandle( * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. */ - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __session_find_shared_dhandle(session, uri, checkpoint)); - WT_RET(ret); + WT_RET(__session_find_shared_dhandle(session, uri, checkpoint)); /* * Fixup the reference count on failure (we incremented the reference * count while holding the handle-list lock). */ - if ((ret = __session_add_dhandle(session)) != 0) - (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); + if ((ret = __session_add_dhandle(session)) != 0) { + WT_DHANDLE_RELEASE(session->dhandle); + session->dhandle = NULL; + } return (ret); } @@ -504,17 +524,15 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, * reopen handles in the meantime. A combination of the schema * and handle list locks are used to enforce this. 
*/ - if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA) || - !F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)) { + if (!F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_WITH_SCHEMA_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_session_get_btree( - session, uri, checkpoint, cfg, flags))); + ret = __wt_session_get_btree( + session, uri, checkpoint, cfg, flags)); return (ret); } @@ -531,7 +549,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, dhandle->excl_session = NULL; dhandle->excl_ref = 0; F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE); - __wt_writeunlock(session, dhandle->rwlock); + __wt_writeunlock(session, &dhandle->rwlock); WT_RET(ret); } @@ -552,7 +570,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) { - WT_DATA_HANDLE *dhandle, *saved_dhandle; + WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; WT_ASSERT(session, WT_META_TRACKING(session)); @@ -560,31 +578,33 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) /* * Get the checkpoint handle exclusive, so no one else can access it - * while we are creating the new checkpoint. + * while we are creating the new checkpoint. Hold the lock until the + * checkpoint completes. */ WT_ERR(__wt_session_get_btree(session, saved_dhandle->name, checkpoint, NULL, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY)); + if ((ret = __wt_meta_track_handle_lock(session, false)) != 0) { + WT_TRET(__wt_session_release_btree(session)); + goto err; + } /* - * Flush any pages in this checkpoint from the cache (we are about to - * re-write the checkpoint which will mean cached pages no longer have - * valid contents). 
This is especially noticeable with memory mapped - * files, since changes to the underlying file are visible to the in - * memory pages. + * Get exclusive access to the handle and then flush any pages in this + * checkpoint from the cache (we are about to re-write the checkpoint + * which will mean cached pages no longer have valid contents). This + * is especially noticeable with memory mapped files, since changes to + * the underlying file are visible to the in-memory pages. */ + WT_ERR(__wt_evict_file_exclusive_on(session)); WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); /* * We lock checkpoint handles that we are overwriting, so the handle * must be closed when we release it. */ - dhandle = session->dhandle; - F_SET(dhandle, WT_DHANDLE_DISCARD); - - WT_ERR(__wt_meta_track_handle_lock(session, false)); + F_SET(session->dhandle, WT_DHANDLE_DISCARD); - /* Restore the original btree in the session. */ + /* Restore the original data handle in the session. */ err: session->dhandle = saved_dhandle; - return (ret); } diff --git a/src/session/session_salvage.c b/src/session/session_salvage.c index 983b28dd8ea..12ce71cdbb0 100644 --- a/src/session/session_salvage.c +++ b/src/session/session_salvage.c @@ -54,6 +54,6 @@ __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_meta_ckptlist_set( session, dhandle->name, ckptbase, NULL)); -err: __wt_meta_ckptlist_free(session, ckptbase); +err: __wt_meta_ckptlist_free(session, &ckptbase); return (ret); } diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c index a3ae67f5baa..600e5eab0ff 100644 --- a/src/support/cond_auto.c +++ b/src/support/cond_auto.c @@ -1,29 +1,9 @@ /*- - * Public Domain 2014-2016 MongoDB, Inc. - * Public Domain 2008-2014 WiredTiger, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. * - * This is free and unencumbered software released into the public domain. 
- * - * Anyone is free to copy, modify, publish, use, compile, sell, or - * distribute this software, either in source code form or as a compiled - * binary, for any purpose, commercial or non-commercial, and by any - * means. - * - * In jurisdictions that recognize copyright laws, the author or authors - * of this software dedicate any and all copyright interest in the - * software to the public domain. We make this dedication for the benefit - * of the public at large and to the detriment of our heirs and - * successors. We intend this dedication to be an overt act of - * relinquishment in perpetuity of all present and future rights to this - * software under copyright law. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. + * See the file LICENSE for redistribution information. */ #include "wt_internal.h" @@ -38,13 +18,12 @@ * Allocate and initialize an automatically adjusting condition variable. */ int -__wt_cond_auto_alloc( - WT_SESSION_IMPL *session, const char *name, - bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +__wt_cond_auto_alloc(WT_SESSION_IMPL *session, + const char *name, uint64_t min, uint64_t max, WT_CONDVAR **condp) { WT_CONDVAR *cond; - WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + WT_RET(__wt_cond_alloc(session, name, condp)); cond = *condp; cond->min_wait = min; @@ -55,33 +34,19 @@ __wt_cond_auto_alloc( } /* - * __wt_cond_auto_signal -- - * Signal a condition variable. 
- */ -void -__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) -{ - - WT_ASSERT(session, cond->min_wait != 0); - __wt_cond_signal(session, cond); -} - -/* * __wt_cond_auto_wait_signal -- * Wait on a mutex, optionally timing out. If we get it before the time * out period expires, let the caller know. - * TODO: Can this version of the API be removed, now that we have the - * auto adjusting condition variables? */ void -__wt_cond_auto_wait_signal( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +__wt_cond_auto_wait_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond, + bool progress, bool (*run_func)(WT_SESSION_IMPL *), bool *signalled) { uint64_t delta; /* * Catch cases where this function is called with a condition variable - * that was initialized non-auto. + * that wasn't initialized to do automatic adjustments. */ WT_ASSERT(session, cond->min_wait != 0); @@ -94,7 +59,8 @@ __wt_cond_auto_wait_signal( cond->max_wait, cond->prev_wait + delta); } - __wt_cond_wait_signal(session, cond, cond->prev_wait, signalled); + __wt_cond_wait_signal( + session, cond, cond->prev_wait, run_func, signalled); if (progress || *signalled) WT_STAT_CONN_INCR(session, cond_auto_wait_reset); @@ -108,24 +74,10 @@ __wt_cond_auto_wait_signal( * out period expires, let the caller know. */ void -__wt_cond_auto_wait( - WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +__wt_cond_auto_wait(WT_SESSION_IMPL *session, + WT_CONDVAR *cond, bool progress, bool (*run_func)(WT_SESSION_IMPL *)) { - bool signalled; - - /* - * Call the signal version so the wait period is reset if the - * condition is woken explicitly. - */ - __wt_cond_auto_wait_signal(session, cond, progress, &signalled); -} + bool notused; -/* - * __wt_cond_auto_destroy -- - * Destroy a condition variable. 
- */ -int -__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) -{ - return (__wt_cond_destroy(session, condp)); + __wt_cond_auto_wait_signal(session, cond, progress, run_func, ¬used); } diff --git a/src/support/crypto.c b/src/support/crypto.c index ab94ec2c829..cce0d228832 100644 --- a/src/support/crypto.c +++ b/src/support/crypto.c @@ -133,5 +133,4 @@ __wt_encrypt_size(WT_SESSION_IMPL *session, return; *sizep = incoming_size + kencryptor->size_const + WT_ENCRYPT_LEN_SIZE; - return; } diff --git a/src/support/err.c b/src/support/err.c index 369997d38c0..57efde72b23 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -102,9 +102,10 @@ __handler_failure(WT_SESSION_IMPL *session, */ char s[256]; - (void)snprintf(s, sizeof(s), + if (__wt_snprintf(s, sizeof(s), "application %s event handler failed: %s", - which, __wt_strerror(session, error, NULL, 0)); + which, __wt_strerror(session, error, NULL, 0)) != 0) + return; /* * Use the error handler to report the failure, unless it was the error @@ -148,6 +149,23 @@ __wt_event_handler_set(WT_SESSION_IMPL *session, WT_EVENT_HANDLER *handler) session->event_handler = handler; } +#define WT_ERROR_APPEND(p, remain, ...) do { \ + size_t __len; \ + WT_ERR(__wt_snprintf_len_set(p, remain, &__len, __VA_ARGS__)); \ + if (__len > remain) \ + __len = remain; \ + p += __len; \ + remain -= __len; \ +} while (0) +#define WT_ERROR_APPEND_AP(p, remain, ...) do { \ + size_t __len; \ + WT_ERR(__wt_vsnprintf_len_set(p, remain, &__len, __VA_ARGS__)); \ + if (__len > remain) \ + __len = remain; \ + p += __len; \ + remain -= __len; \ +} while (0) + /* * __wt_eventv -- * Report a message to an event handler. 
@@ -161,9 +179,9 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, WT_DECL_RET; WT_SESSION *wt_session; struct timespec ts; - size_t len, remain, wlen; + size_t len, remain; const char *err, *prefix; - char *end, *p, tid[128]; + char *p, tid[128]; /* * We're using a stack buffer because we want error messages no matter @@ -174,6 +192,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * Buffer placed at the end of the stack in case snprintf overflows. */ char s[2048]; + p = s; + remain = sizeof(s); /* * !!! @@ -185,24 +205,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * first session, but if the allocation of the first session fails, for * example, we can end up here without a session.) */ - if (session == NULL) { - if (fprintf(stderr, - "WiredTiger Error%s%s: ", - error == 0 ? "" : ": ", - error == 0 ? "" : - __wt_strerror(session, error, NULL, 0)) < 0) - ret = EIO; - if (vfprintf(stderr, fmt, ap) < 0) - ret = EIO; - if (fprintf(stderr, "\n") < 0) - ret = EIO; - if (fflush(stderr) != 0) - ret = EIO; - return (ret); - } - - p = s; - end = s + sizeof(s); + if (session == NULL) + goto err; /* * We have several prefixes for the error message: a timestamp and the @@ -211,42 +215,24 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * followed by a colon. */ __wt_epoch(session, &ts); - __wt_thread_id(tid, sizeof(tid)); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", + WT_ERR(__wt_thread_id(tid, sizeof(tid))); + WT_ERROR_APPEND(p, remain, + "[%" PRIuMAX ":%" PRIuMAX "][%s]", (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); - p = wlen >= remain ? end : p + wlen; - if ((prefix = S2C(session)->error_prefix) != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? 
end : p + wlen; - } + if ((prefix = S2C(session)->error_prefix) != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); prefix = session->dhandle == NULL ? NULL : session->dhandle->name; - if (prefix != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? end : p + wlen; - } - if ((prefix = session->name) != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ", %s", prefix); - p = wlen >= remain ? end : p + wlen; - } - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ": "); - p = wlen >= remain ? end : p + wlen; - - if (file_name != NULL) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t) - snprintf(p, remain, "%s, %d: ", file_name, line_number); - p = wlen >= remain ? end : p + wlen; - } + if (prefix != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); + if ((prefix = session->name) != NULL) + WT_ERROR_APPEND(p, remain, ", %s", prefix); + WT_ERROR_APPEND(p, remain, ": "); + + if (file_name != NULL) + WT_ERROR_APPEND(p, remain, "%s, %d: ", file_name, line_number); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)vsnprintf(p, remain, fmt, ap); - p = wlen >= remain ? end : p + wlen; + WT_ERROR_APPEND_AP(p, remain, fmt, ap); if (error != 0) { /* @@ -261,10 +247,8 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, */ err = __wt_strerror(session, error, NULL, 0); len = strlen(err); - if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) { - remain = WT_PTRDIFF(end, p); - (void)snprintf(p, remain, ": %s", err); - } + if (WT_PTRDIFF(p, s) < len || strcmp(p - len, err) != 0) + WT_ERROR_APPEND(p, remain, ": %s", err); } /* @@ -279,7 +263,7 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * * If an application-specified error message handler fails, complain * using the default error handler. If the default error handler fails, - * there's nothing to do. + * fallback to stderr. 
*/ wt_session = (WT_SESSION *)session; handler = session->event_handler; @@ -293,6 +277,21 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, __handler_failure(session, ret, "error", true); } + if (ret != 0) { +err: if (fprintf(stderr, + "WiredTiger Error%s%s: ", + error == 0 ? "" : ": ", + error == 0 ? "" : + __wt_strerror(session, error, NULL, 0)) < 0) + WT_TRET(EIO); + if (vfprintf(stderr, fmt, ap) < 0) + WT_TRET(EIO); + if (fprintf(stderr, "\n") < 0) + WT_TRET(EIO); + if (fflush(stderr) != 0) + WT_TRET(EIO); + } + return (ret); } @@ -376,7 +375,7 @@ info_msg(WT_SESSION_IMPL *session, const char *fmt, va_list ap) */ char s[2048]; - (void)vsnprintf(s, sizeof(s), fmt, ap); + WT_RET(__wt_vsnprintf(s, sizeof(s), fmt, ap)); wt_session = (WT_SESSION *)session; handler = session->event_handler; diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c index ea18f556257..35ad5da23f2 100644 --- a/src/support/mtx_rw.c +++ b/src/support/mtx_rw.c @@ -115,23 +115,27 @@ #include "wt_internal.h" /* - * __wt_rwlock_alloc -- - * Allocate and initialize a read/write lock. + * __wt_rwlock_init -- + * Initialize a read/write lock. */ -int -__wt_rwlock_alloc( - WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name) +void +__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - WT_RWLOCK *rwlock; - - __wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name); + WT_UNUSED(session); - WT_RET(__wt_calloc_one(session, &rwlock)); + l->u = 0; +} - rwlock->name = name; +/* + * __wt_rwlock_destroy -- + * Destroy a read/write lock. + */ +void +__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l) +{ + WT_UNUSED(session); - *rwlockp = rwlock; - return (0); + l->u = 0; } /* @@ -139,13 +143,12 @@ __wt_rwlock_alloc( * Try to get a shared lock, fail immediately if unavailable. 
*/ int -__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_read); - l = &rwlock->rwlock; new = old = *l; /* @@ -172,19 +175,15 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * exclusive. */ void -__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - - l = &rwlock->rwlock; - /* * Try to get the lock in a single operation if it is available to * readers. This avoids the situation where multiple readers arrive * concurrently and have to line up in order to enter the lock. For * read-heavy workloads it can make a significant difference. */ - while (__wt_try_readlock(session, rwlock) != 0) { + while (__wt_try_readlock(session, l) != 0) { if (l->s.writers_active > 0) __wt_yield(); else @@ -197,9 +196,8 @@ __wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Get a shared lock. */ void -__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; @@ -207,8 +205,6 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -246,14 +242,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release a shared lock. */ void -__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - /* * Increment the writers value (other readers are doing the same, make * sure we don't race). 
@@ -266,13 +258,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Try to get an exclusive lock, fail immediately if unavailable. */ int -__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new, old; + WT_RWLOCK new, old; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; old = new = *l; /* @@ -296,16 +287,13 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Wait to get an exclusive lock. */ void -__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; uint16_t ticket; int pause_cnt; WT_STAT_CONN_INCR(session, rwlock_write); - l = &rwlock->rwlock; - /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket * value will wrap and two lockers will simultaneously be granted the @@ -338,13 +326,12 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) * Release an exclusive lock. */ void -__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l, new; + WT_RWLOCK new; WT_UNUSED(session); - l = &rwlock->rwlock; (void)__wt_atomic_sub16(&l->s.writers_active, 1); /* @@ -368,40 +355,16 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_DIAGNOSTIC_YIELD; } -/* - * __wt_rwlock_destroy -- - * Destroy a read/write lock. - */ -void -__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp) -{ - WT_RWLOCK *rwlock; - - rwlock = *rwlockp; /* Clear our caller's reference. */ - if (rwlock == NULL) - return; - *rwlockp = NULL; - - __wt_verbose( - session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name); - - __wt_free(session, rwlock); -} - #ifdef HAVE_DIAGNOSTIC /* * __wt_rwlock_islocked -- * Return if a read/write lock is currently locked for reading or writing. 
*/ bool -__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) +__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) { - wt_rwlock_t *l; - WT_UNUSED(session); - l = &rwlock->rwlock; - return (l->s.writers != l->s.next || l->s.readers != l->s.next); } #endif diff --git a/src/support/rand.c b/src/support/rand.c index a5b229b9abc..4fae43edc8e 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -120,3 +120,15 @@ __wt_random(WT_RAND_STATE volatile * rnd_state) return ((z << 16) + (w & 65535)); } + +/* + * __wt_random64 -- + * Return a 64-bit pseudo-random number. + */ +uint64_t +__wt_random64(WT_RAND_STATE volatile * rnd_state) + WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) +{ + return (((uint64_t)__wt_random(rnd_state) << 32) + + __wt_random(rnd_state)); +} diff --git a/src/support/scratch.c b/src/support/scratch.c index 69987ebc852..485cea90e89 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -69,13 +69,16 @@ int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) { + WT_DECL_RET; va_list ap; size_t len; for (;;) { va_start(ap, fmt); - len = (size_t)vsnprintf(buf->mem, buf->memsize, fmt, ap); + ret = __wt_vsnprintf_len_set( + buf->mem, buf->memsize, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. */ if (len < buf->memsize) { @@ -100,6 +103,7 @@ int __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4))) { + WT_DECL_RET; va_list ap; size_t len, space; char *p; @@ -117,8 +121,9 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) p = (char *)((uint8_t *)buf->mem + buf->size); WT_ASSERT(session, buf->memsize >= buf->size); space = buf->memsize - buf->size; - len = (size_t)vsnprintf(p, space, fmt, ap); + ret = __wt_vsnprintf_len_set(p, space, &len, fmt, ap); va_end(ap); + WT_RET(ret); /* Check if there was enough space. 
*/ if (len < space) { diff --git a/src/support/stat.c b/src/support/stat.c index a9c0b24ef29..2c2217f8c20 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -64,6 +64,7 @@ static const char * const __stats_dsrc_desc[] = { "cache: pages requested from the cache", "cache: pages written from cache", "cache: pages written requiring in-memory restoration", + "cache: tracked dirty bytes in the cache", "cache: unmodified pages evicted", "cache_walk: Average difference between current eviction generation when the page was last considered", "cache_walk: Average on-disk page image size seen", @@ -225,6 +226,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cache_pages_requested = 0; stats->cache_write = 0; stats->cache_write_restore = 0; + /* not clearing cache_bytes_dirty */ stats->cache_eviction_clean = 0; /* not clearing cache_state_gen_avg_gap */ /* not clearing cache_state_avg_written_size */ @@ -372,6 +374,7 @@ __wt_stat_dsrc_aggregate_single( to->cache_pages_requested += from->cache_pages_requested; to->cache_write += from->cache_write; to->cache_write_restore += from->cache_write_restore; + to->cache_bytes_dirty += from->cache_bytes_dirty; to->cache_eviction_clean += from->cache_eviction_clean; to->cache_state_gen_avg_gap += from->cache_state_gen_avg_gap; to->cache_state_avg_written_size += @@ -535,6 +538,7 @@ __wt_stat_dsrc_aggregate( WT_STAT_READ(from, cache_pages_requested); to->cache_write += WT_STAT_READ(from, cache_write); to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); + to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); to->cache_state_gen_avg_gap += WT_STAT_READ(from, cache_state_gen_avg_gap); @@ -673,10 +677,15 @@ static const char * const __stats_connection_desc[] = { "cache: eviction server unable to reach eviction goal", "cache: eviction state", "cache: eviction walks abandoned", + "cache: eviction worker thread active", + "cache: 
eviction worker thread created", "cache: eviction worker thread evicting pages", + "cache: eviction worker thread removed", + "cache: eviction worker thread stable number", "cache: failed eviction of pages that exceeded the in-memory maximum", "cache: files with active eviction walks", "cache: files with new eviction walks started", + "cache: force re-tuning of eviction workers once in a while", "cache: hazard pointer blocked page eviction", "cache: hazard pointer check calls", "cache: hazard pointer check entries walked", @@ -751,9 +760,7 @@ static const char * const __stats_connection_desc[] = { "lock: checkpoint lock acquisitions", "lock: checkpoint lock application thread wait time (usecs)", "lock: checkpoint lock internal thread wait time (usecs)", - "lock: handle-list lock acquisitions", - "lock: handle-list lock application thread wait time (usecs)", - "lock: handle-list lock internal thread wait time (usecs)", + "lock: handle-list lock eviction thread wait time (usecs)", "lock: metadata lock acquisitions", "lock: metadata lock application thread wait time (usecs)", "lock: metadata lock internal thread wait time (usecs)", @@ -765,9 +772,11 @@ static const char * const __stats_connection_desc[] = { "lock: table lock internal thread time waiting for the table lock (usecs)", "log: busy returns attempting to switch slots", "log: consolidated slot closures", + "log: consolidated slot join active slot closed", "log: consolidated slot join races", "log: consolidated slot join transitions", "log: consolidated slot joins", + "log: consolidated slot transitions unable to find free slot", "log: consolidated slot unbuffered writes", "log: log bytes of payload data", "log: log bytes written", @@ -954,10 +963,15 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_eviction_slow = 0; /* not clearing cache_eviction_state */ stats->cache_eviction_walks_abandoned = 0; + /* not clearing cache_eviction_active_workers */ + 
stats->cache_eviction_worker_created = 0; stats->cache_eviction_worker_evicting = 0; + stats->cache_eviction_worker_removed = 0; + /* not clearing cache_eviction_stable_state_workers */ stats->cache_eviction_force_fail = 0; /* not clearing cache_eviction_walks_active */ stats->cache_eviction_walks_started = 0; + stats->cache_eviction_force_retune = 0; stats->cache_eviction_hazard = 0; stats->cache_hazard_checks = 0; stats->cache_hazard_walks = 0; @@ -1032,9 +1046,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_checkpoint_count = 0; stats->lock_checkpoint_wait_application = 0; stats->lock_checkpoint_wait_internal = 0; - stats->lock_handle_list_count = 0; - stats->lock_handle_list_wait_application = 0; - stats->lock_handle_list_wait_internal = 0; + stats->lock_handle_list_wait_eviction = 0; stats->lock_metadata_count = 0; stats->lock_metadata_wait_application = 0; stats->lock_metadata_wait_internal = 0; @@ -1046,9 +1058,11 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->lock_table_wait_internal = 0; stats->log_slot_switch_busy = 0; stats->log_slot_closes = 0; + stats->log_slot_active_closed = 0; stats->log_slot_races = 0; stats->log_slot_transitions = 0; stats->log_slot_joins = 0; + stats->log_slot_no_free_slots = 0; stats->log_slot_unbuffered = 0; stats->log_bytes_payload = 0; stats->log_bytes_written = 0; @@ -1228,14 +1242,24 @@ __wt_stat_connection_aggregate( to->cache_eviction_state += WT_STAT_READ(from, cache_eviction_state); to->cache_eviction_walks_abandoned += WT_STAT_READ(from, cache_eviction_walks_abandoned); + to->cache_eviction_active_workers += + WT_STAT_READ(from, cache_eviction_active_workers); + to->cache_eviction_worker_created += + WT_STAT_READ(from, cache_eviction_worker_created); to->cache_eviction_worker_evicting += WT_STAT_READ(from, cache_eviction_worker_evicting); + to->cache_eviction_worker_removed += + WT_STAT_READ(from, cache_eviction_worker_removed); + 
to->cache_eviction_stable_state_workers += + WT_STAT_READ(from, cache_eviction_stable_state_workers); to->cache_eviction_force_fail += WT_STAT_READ(from, cache_eviction_force_fail); to->cache_eviction_walks_active += WT_STAT_READ(from, cache_eviction_walks_active); to->cache_eviction_walks_started += WT_STAT_READ(from, cache_eviction_walks_started); + to->cache_eviction_force_retune += + WT_STAT_READ(from, cache_eviction_force_retune); to->cache_eviction_hazard += WT_STAT_READ(from, cache_eviction_hazard); to->cache_hazard_checks += WT_STAT_READ(from, cache_hazard_checks); @@ -1331,12 +1355,8 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_checkpoint_wait_application); to->lock_checkpoint_wait_internal += WT_STAT_READ(from, lock_checkpoint_wait_internal); - to->lock_handle_list_count += - WT_STAT_READ(from, lock_handle_list_count); - to->lock_handle_list_wait_application += - WT_STAT_READ(from, lock_handle_list_wait_application); - to->lock_handle_list_wait_internal += - WT_STAT_READ(from, lock_handle_list_wait_internal); + to->lock_handle_list_wait_eviction += + WT_STAT_READ(from, lock_handle_list_wait_eviction); to->lock_metadata_count += WT_STAT_READ(from, lock_metadata_count); to->lock_metadata_wait_application += WT_STAT_READ(from, lock_metadata_wait_application); @@ -1354,9 +1374,13 @@ __wt_stat_connection_aggregate( WT_STAT_READ(from, lock_table_wait_internal); to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); + to->log_slot_active_closed += + WT_STAT_READ(from, log_slot_active_closed); to->log_slot_races += WT_STAT_READ(from, log_slot_races); to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions); to->log_slot_joins += WT_STAT_READ(from, log_slot_joins); + to->log_slot_no_free_slots += + WT_STAT_READ(from, log_slot_no_free_slots); to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered); to->log_bytes_payload += WT_STAT_READ(from, 
log_bytes_payload); to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); diff --git a/src/support/thread_group.c b/src/support/thread_group.c index a866d2d01c5..2b4b7ad4e61 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -50,8 +50,7 @@ __thread_group_grow( { WT_THREAD *thread; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* * Any bounds checking is done by the caller so we know that @@ -72,20 +71,19 @@ __thread_group_grow( /* * __thread_group_shrink -- - * Decrease the number of running threads in the group, and free any + * Decrease the number of running threads in the group. Optionally free any * memory associated with slots larger than the new count. */ static int __thread_group_shrink(WT_SESSION_IMPL *session, - WT_THREAD_GROUP *group, uint32_t new_count) + WT_THREAD_GROUP *group, uint32_t new_count, bool free_thread) { WT_DECL_RET; WT_SESSION *wt_session; WT_THREAD *thread; uint32_t current_slot; - WT_ASSERT(session, - __wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); for (current_slot = group->alloc; current_slot > new_count; ) { /* @@ -107,14 +105,15 @@ __thread_group_shrink(WT_SESSION_IMPL *session, WT_TRET(__wt_thread_join(session, thread->tid)); thread->tid = 0; } - - if (thread->session != NULL) { - wt_session = (WT_SESSION *)thread->session; - WT_TRET(wt_session->close(wt_session, NULL)); - thread->session = NULL; + if (free_thread) { + if (thread->session != NULL) { + wt_session = (WT_SESSION *)thread->session; + WT_TRET(wt_session->close(wt_session, NULL)); + thread->session = NULL; + } + __wt_free(session, thread); + group->threads[current_slot] = NULL; } - __wt_free(session, thread); - group->threads[current_slot] = NULL; } /* Update the thread group state to match our changes */ @@ -142,16 +141,19 @@ __thread_group_resize( WT_ASSERT(session, 
group->current_threads <= group->alloc && - __wt_rwlock_islocked(session, group->lock)); + __wt_rwlock_islocked(session, &group->lock)); if (new_min == group->min && new_max == group->max) return (0); + if (new_min > new_max) + return (EINVAL); + /* - * Coll shrink to reduce the number of thread structures and running + * Call shrink to reduce the number of thread structures and running * threads if required by the change in group size. */ - WT_RET(__thread_group_shrink(session, group, new_max)); + WT_RET(__thread_group_shrink(session, group, new_max, true)); /* * Only reallocate the thread array if it is the largest ever, since @@ -227,9 +229,9 @@ __wt_thread_group_resize( " from max: %" PRIu32 " -> %" PRIu32, (void *)group, group->min, new_min, group->max, new_max); - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); WT_TRET(__thread_group_resize(session, group, new_min, new_max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); return (ret); } @@ -255,17 +257,17 @@ __wt_thread_group_create( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Creating thread group: %p", (void *)group); - WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); + __wt_rwlock_init(session, &group->lock); WT_ERR(__wt_cond_alloc( - session, "Thread group cond", false, &group->wait_cond)); + session, "thread group cond", &group->wait_cond)); cond_alloced = true; - __wt_writelock(session, group->lock); + __wt_writelock(session, &group->lock); group->run_func = run_func; group->name = name; WT_TRET(__thread_group_resize(session, group, min, max, flags)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); /* Cleanup on error to avoid leaking resources */ err: if (ret != 0) { @@ -288,10 +290,10 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) __wt_verbose(session, WT_VERB_THREAD_GROUP, "Destroying thread group: %p", (void *)group); - WT_ASSERT(session, 
__wt_rwlock_islocked(session, group->lock)); + WT_ASSERT(session, __wt_rwlock_islocked(session, &group->lock)); /* Shut down all threads and free associated resources. */ - WT_TRET(__thread_group_shrink(session, group, 0)); + WT_TRET(__thread_group_shrink(session, group, 0, true)); __wt_free(session, group->threads); @@ -322,15 +324,42 @@ __wt_thread_group_start_one( return (0); if (wait) - __wt_writelock(session, group->lock); - else if (__wt_try_writelock(session, group->lock) != 0) - return (0); + __wt_writelock(session, &group->lock); + else + WT_RET(__wt_try_writelock(session, &group->lock)); /* Recheck the bounds now that we hold the lock */ if (group->current_threads < group->max) WT_TRET(__thread_group_grow( session, group, group->current_threads + 1)); - __wt_writeunlock(session, group->lock); + __wt_writeunlock(session, &group->lock); + + return (ret); +} + +/* + * __wt_thread_group_stop_one -- + * Stop one thread if possible. + */ +int +__wt_thread_group_stop_one( + WT_SESSION_IMPL *session, WT_THREAD_GROUP *group, bool wait) +{ + WT_DECL_RET; + + if (group->current_threads <= group->min) + return (0); + + if (wait) + __wt_writelock(session, &group->lock); + else + WT_RET(__wt_try_writelock(session, &group->lock)); + + /* Recheck the bounds now that we hold the lock */ + if (group->current_threads > group->min) + WT_TRET(__thread_group_shrink( + session, group, group->current_threads - 1, false)); + __wt_writeunlock(session, &group->lock); return (ret); } diff --git a/src/txn/txn.c b/src/txn/txn.c index 26a0ed679e2..6eebf5ecf9f 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -126,7 +126,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) n = 0; /* We're going to scan the table: wait for the lock. 
*/ - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -180,7 +180,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->pinned_id = pinned_id; -done: __wt_readunlock(session, txn_global->scan_rwlock); +done: __wt_readunlock(session, &txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); } @@ -293,13 +293,13 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* First do a read-only scan. */ if (wait) - __wt_readlock_spin(session, txn_global->scan_rwlock); + __wt_readlock_spin(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_readlock(session, txn_global->scan_rwlock)) != 0) + __wt_try_readlock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 0 : ret); __txn_oldest_scan(session, &oldest_id, &last_running, &metadata_pinned, &oldest_session); - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); /* * If the state hasn't changed (or hasn't moved far enough for @@ -314,9 +314,9 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) /* It looks like an update is necessary, wait for exclusive access. */ if (wait) - __wt_writelock(session, txn_global->scan_rwlock); + __wt_writelock(session, &txn_global->scan_rwlock); else if ((ret = - __wt_try_writelock(session, txn_global->scan_rwlock)) != 0) + __wt_try_writelock(session, &txn_global->scan_rwlock)) != 0) return (ret == EBUSY ? 
0 : ret); /* @@ -375,7 +375,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) #endif } -done: __wt_writeunlock(session, txn_global->scan_rwlock); +done: __wt_writeunlock(session, &txn_global->scan_rwlock); return (ret); } @@ -713,7 +713,7 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) snapshot_pinned = txn_global->nsnap_oldest_id; WT_STAT_SET(session, stats, txn_pinned_range, - txn_global->current - txn_global->oldest_id); + txn_global->current - txn_global->oldest_id); WT_STAT_SET(session, stats, txn_pinned_snapshot_range, snapshot_pinned == WT_TXN_NONE ? @@ -768,10 +768,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &txn_global->id_lock, "transaction id lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->scan_rwlock, "transaction scan lock")); - WT_RET(__wt_rwlock_alloc(session, - &txn_global->nsnap_rwlock, "named snapshot lock")); + __wt_rwlock_init(session, &txn_global->scan_rwlock); + __wt_rwlock_init(session, &txn_global->nsnap_rwlock); txn_global->nsnap_oldest_id = WT_TXN_NONE; TAILQ_INIT(&txn_global->nsnaph); @@ -805,3 +803,98 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session) __wt_rwlock_destroy(session, &txn_global->nsnap_rwlock); __wt_free(session, txn_global->states); } + +#if defined(HAVE_DIAGNOSTIC) || defined(HAVE_VERBOSE) +/* + * __wt_verbose_dump_txn -- + * Output diagnostic information about the global transaction state. 
+ */ +int +__wt_verbose_dump_txn(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_TXN_GLOBAL *txn_global; + WT_TXN *txn; + WT_TXN_STATE *s; + const char *iso_tag; + uint64_t id; + uint32_t i, session_cnt; + + conn = S2C(session); + txn_global = &conn->txn_global; + + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + WT_RET(__wt_msg(session, "transaction state dump")); + + WT_RET(__wt_msg(session, "current ID: %" PRIu64, txn_global->current)); + WT_RET(__wt_msg(session, + "last running ID: %" PRIu64, txn_global->last_running)); + WT_RET(__wt_msg(session, "oldest ID: %" PRIu64, txn_global->oldest_id)); + WT_RET(__wt_msg(session, + "oldest named snapshot ID: %" PRIu64, txn_global->nsnap_oldest_id)); + + WT_RET(__wt_msg(session, "checkpoint running? %s", + txn_global->checkpoint_running ? "yes" : "no")); + WT_RET(__wt_msg(session, + "checkpoint generation: %" PRIu64, txn_global->checkpoint_gen)); + WT_RET(__wt_msg(session, + "checkpoint pinned ID: %" PRIu64, txn_global->checkpoint_pinned)); + WT_RET(__wt_msg(session, + "checkpoint txn ID: %" PRIu64, txn_global->checkpoint_txnid)); + + WT_ORDERED_READ(session_cnt, conn->session_cnt); + WT_RET(__wt_msg(session, "session count: %" PRIu32, session_cnt)); + + WT_RET(__wt_msg(session, "Transaction state of active sessions:")); + + /* + * Walk each session transaction state and dump information. Accessing + * the content of session handles is not thread safe, so some + * information may change while traversing if other threads are active + * at the same time, which is OK since this is diagnostic code. 
+ */ + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { + /* Skip sessions with no active transaction */ + if ((id = s->id) == WT_TXN_NONE && s->pinned_id == WT_TXN_NONE) + continue; + + txn = &conn->sessions[i].txn; + iso_tag = "INVALID"; + switch (txn->isolation) { + case WT_ISO_READ_COMMITTED: + iso_tag = "WT_ISO_READ_COMMITTED"; + break; + case WT_ISO_READ_UNCOMMITTED: + iso_tag = "WT_ISO_READ_UNCOMMITTED"; + break; + case WT_ISO_SNAPSHOT: + iso_tag = "WT_ISO_SNAPSHOT"; + break; + } + + WT_RET(__wt_msg(session, + "ID: %6" PRIu64 + ", mod count: %u" + ", pinned ID: %" PRIu64 + ", snap min: %" PRIu64 + ", snap max: %" PRIu64 + ", metadata pinned ID: %" PRIu64 + ", flags: 0x%08" PRIx32 + ", name: %s" + ", isolation: %s", + id, + txn->mod_count, + s->pinned_id, + txn->snap_min, + txn->snap_max, + s->metadata_pinned, + txn->flags, + conn->sessions[i].name == NULL ? + "EMPTY" : conn->sessions[i].name, + iso_tag)); + } + WT_RET(__wt_msg(session, "%s", WT_DIVIDER)); + + return (0); +} +#endif diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 399d9187d82..f4ccf5eacd0 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -8,9 +8,9 @@ #include "wt_internal.h" -static int __checkpoint_lock_tree( - WT_SESSION_IMPL *, bool, bool, const char *[]); -static int __checkpoint_mark_deletes(WT_SESSION_IMPL *, const char *[]); +static int __checkpoint_lock_dirty_tree( + WT_SESSION_IMPL *, bool, bool, bool, const char *[]); +static int __checkpoint_mark_skip(WT_SESSION_IMPL *, WT_CKPT *, bool); static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]); static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); @@ -90,6 +90,33 @@ err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); } /* + * __checkpoint_update_generation -- + * Update the checkpoint generation of the current tree. + * + * This indicates that the tree will not be visited again by the current + * checkpoint. 
+ */ +static void +__checkpoint_update_generation(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* + * Updates to the metadata are made by the checkpoint transaction, so + * the metadata tree's checkpoint generation should never be updated. + */ + if (WT_IS_METADATA(session->dhandle)) + return; + + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_DATA_SET(session, + btree_checkpoint_generation, btree->checkpoint_gen); +} + +/* * __checkpoint_apply_all -- * Apply an operation to all files involved in a checkpoint. */ @@ -239,22 +266,82 @@ int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; + WT_CONFIG_ITEM cval; WT_DECL_RET; const char *name; + bool force; + + btree = S2BT(session); + + /* Find out if we have to force a checkpoint. */ + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + if (!force) { + WT_RET(__wt_config_gets_def(session, cfg, "name", 0, &cval)); + force = cval.len != 0; + } /* Should not be called with anything other than a file object. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); /* Skip files that are never involved in a checkpoint. */ - if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + return (0); + +#ifdef HAVE_DIAGNOSTIC + /* + * We may have raced between starting the checkpoint transaction and + * some operation completing on the handle that updated the metadata + * (e.g., closing a bulk load cursor). All such operations either have + * exclusive access to the handle or hold the schema lock. We are now + * holding the schema lock and have an open btree handle, so if we + * can't update the metadata, then there has been some state change + * invisible to the checkpoint transaction. 
+ */ + if (!WT_IS_METADATA(session->dhandle)) { + WT_CURSOR *meta_cursor; + bool metadata_race; + + WT_ASSERT(session, !F_ISSET(&session->txn, WT_TXN_ERROR)); + WT_RET(__wt_metadata_cursor(session, &meta_cursor)); + meta_cursor->set_key(meta_cursor, session->dhandle->name); + ret = __wt_curfile_insert_check(meta_cursor); + if (ret == WT_ROLLBACK) { + metadata_race = true; + ret = 0; + } else + metadata_race = false; + WT_TRET(__wt_metadata_cursor_release(session, &meta_cursor)); + WT_RET(ret); + WT_ASSERT(session, !metadata_race); + } +#endif + + /* + * Decide whether the tree needs to be included in the checkpoint and + * if so, acquire the necessary locks. + */ + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, true, force, true, cfg)); + WT_RET(ret); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) { + WT_ASSERT(session, btree->ckpt == NULL); + __checkpoint_update_generation(session); return (0); + } - /* Make sure there is space for the next entry. */ + /* + * Make sure there is space for the new entry: do this before getting + * the handle to avoid cleanup if we can't allocate the memory. + */ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); - /* Not strictly necessary, but cleaner to clear the current handle. */ + /* + * The current tree will be included: get it again because the handle + * we have is only valid for the duration of this function. + */ name = session->dhandle->name; session->dhandle = NULL; @@ -266,49 +353,13 @@ __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) * with eviction and we don't want to unfairly penalize (or promote) * eviction in trees due to checkpoints. 
*/ - btree = S2BT(session); btree->evict_walk_saved = btree->evict_walk_period; - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, true, true, cfg)); - if (ret != 0) { - WT_TRET(__wt_session_release_btree(session)); - return (ret); - } - - /* - * Flag that the handle is part of a checkpoint for the purposes - * of transaction visibility checks. - */ - WT_PUBLISH(btree->include_checkpoint_txn, true); - session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; return (0); } /* - * __checkpoint_update_generation -- - * Update the checkpoint generation of the current tree. - * - * This indicates that the tree will not be visited again by the current - * checkpoint. - */ -static void -__checkpoint_update_generation(WT_SESSION_IMPL *session) -{ - WT_BTREE *btree; - - btree = S2BT(session); - if (!WT_IS_METADATA(session->dhandle)) - WT_PUBLISH(btree->include_checkpoint_txn, false); - - WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); - WT_STAT_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); -} - -/* * __checkpoint_reduce_dirty_cache -- * Release clean trees from the list cached for checkpoints. */ @@ -371,7 +422,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) __wt_sleep(0, stepdown_us / 10); __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); - total_ms = WT_TIMEDIFF_MS(stop, start); bytes_written_total = cache->bytes_written - bytes_written_start; @@ -434,36 +484,6 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) } /* - * __checkpoint_release_clean_trees -- - * Release clean trees from the list cached for checkpoints. 
- */ -static int -__checkpoint_release_clean_trees(WT_SESSION_IMPL *session) -{ - WT_BTREE *btree; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - u_int i; - - for (i = 0; i < session->ckpt_handle_next; i++) { - dhandle = session->ckpt_handle[i]; - btree = dhandle->handle; - if (!F_ISSET(btree, WT_BTREE_SKIP_CKPT)) - continue; - __wt_meta_ckptlist_free(session, btree->ckpt); - btree->ckpt = NULL; - WT_WITH_DHANDLE(session, dhandle, - __checkpoint_update_generation(session)); - session->ckpt_handle[i] = NULL; - WT_WITH_DHANDLE(session, dhandle, - ret = __wt_session_release_btree(session)); - WT_RET(ret); - } - - return (0); -} - -/* * __checkpoint_stats -- * Update checkpoint timer stats. */ @@ -525,6 +545,112 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, } /* + * __checkpoint_fail_reset -- + * Reset fields when a failure occurs. + */ +static void +__checkpoint_fail_reset(WT_SESSION_IMPL *session) +{ + WT_BTREE *btree; + + btree = S2BT(session); + btree->modified = true; + __wt_meta_ckptlist_free(session, &btree->ckpt); +} + +/* + * __checkpoint_prepare -- + * Start the transaction for a checkpoint and gather handles. + */ +static int +__checkpoint_prepare(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_TXN *txn; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *txn_state; + const char *txn_cfg[] = { WT_CONFIG_BASE(session, + WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; + + conn = S2C(session); + txn = &session->txn; + txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); + + /* + * Start a snapshot transaction for the checkpoint. + * + * Note: we don't go through the public API calls because they have + * side effects on cursors, which applications can hold open across + * calls to checkpoint. 
+ */ + WT_RET(__wt_txn_begin(session, txn_cfg)); + + WT_DIAGNOSTIC_YIELD; + + /* Ensure a transaction ID is allocated prior to sharing it globally */ + WT_RET(__wt_txn_id_check(session)); + + /* + * Mark the connection as clean. If some data gets modified after + * generating checkpoint transaction id, connection will be reset to + * dirty when reconciliation marks the btree dirty on encountering the + * dirty page. + */ + conn->modified = false; + + /* + * Save the checkpoint session ID. + * + * We never do checkpoints in the default session (with id zero). + */ + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); + txn_global->checkpoint_id = session->id; + + /* + * Remove the checkpoint transaction from the global table. + * + * This allows ordinary visibility checks to move forward because + * checkpoints often take a long time and only write to the metadata. + */ + __wt_writelock(session, &txn_global->scan_rwlock); + txn_global->checkpoint_txnid = txn->id; + txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); + + /* + * Sanity check that the oldest ID hasn't moved on before we have + * cleared our entry. + */ + WT_ASSERT(session, + WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); + + /* + * Clear our entry from the global transaction session table. Any + * operation that needs to know about the ID for this checkpoint will + * consider the checkpoint ID in the global structure. Most operations + * can safely ignore the checkpoint ID (see the visible all check for + * details). + */ + txn_state->id = txn_state->pinned_id = + txn_state->metadata_pinned = WT_TXN_NONE; + __wt_writeunlock(session, &txn_global->scan_rwlock); + + /* + * Get a list of handles we want to flush; for named checkpoints this + * may pull closed objects into the session cache. + * + * First, gather all handles, then start the checkpoint transaction, + * then release any clean handles. 
+ */ + WT_ASSERT(session, session->ckpt_handle_next == 0); + WT_WITH_TABLE_READ_LOCK(session, ret = __checkpoint_apply_all( + session, cfg, __wt_checkpoint_get_handles, NULL)); + return (ret); +} + +/* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. */ @@ -539,19 +665,15 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; - WT_TXN_STATE *txn_state; void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; - bool full, idle, logging, tracking; - const char *txn_cfg[] = { WT_CONFIG_BASE(session, - WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; + bool failed, full, idle, logging, tracking; conn = S2C(session); cache = conn->cache; txn = &session->txn; txn_global = &conn->txn_global; - txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; full = idle = logging = tracking = false; @@ -620,87 +742,24 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) tracking = true; /* - * Get a list of handles we want to flush; for named checkpoints this - * may pull closed objects into the session cache. - * * We want to skip checkpointing clean handles whenever possible. That * is, when the checkpoint is not named or forced. However, we need to * take care about ordering with respect to the checkpoint transaction. * - * If we skip clean handles before starting the transaction, the + * We can't skip clean handles before starting the transaction or the * checkpoint can miss updates in trees that become dirty as the * checkpoint is starting. If we wait until the transaction has * started before locking a handle, there could be a metadata-changing * operation in between (e.g., salvage) that will cause a write * conflict when the checkpoint goes to write the metadata. * - * First, gather all handles, then start the checkpoint transaction, - * then release any clean handles. 
+ * Hold the schema lock while starting the transaction and gathering + * handles so the set we get is complete and correct. */ - WT_ASSERT(session, session->ckpt_handle_next == 0); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_get_handles, NULL)))); + WT_WITH_SCHEMA_LOCK(session, ret = __checkpoint_prepare(session, cfg)); WT_ERR(ret); - /* - * Start a snapshot transaction for the checkpoint. - * - * Note: we don't go through the public API calls because they have - * side effects on cursors, which applications can hold open across - * calls to checkpoint. - */ - WT_ERR(__wt_txn_begin(session, txn_cfg)); - - /* Ensure a transaction ID is allocated prior to sharing it globally */ - WT_ERR(__wt_txn_id_check(session)); - - /* - * Mark the connection as clean. If some data gets modified after - * generating checkpoint transaction id, connection will be reset to - * dirty when reconciliation marks the btree dirty on encountering the - * dirty page. - */ - conn->modified = false; - - /* - * Save the checkpoint session ID. - * - * We never do checkpoints in the default session (with id zero). - */ - WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); - txn_global->checkpoint_id = session->id; - - /* - * Remove the checkpoint transaction from the global table. - * - * This allows ordinary visibility checks to move forward because - * checkpoints often take a long time and only write to the metadata. - */ - __wt_writelock(session, txn_global->scan_rwlock); - txn_global->checkpoint_txnid = txn->id; - txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); - - /* - * Sanity check that the oldest ID hasn't moved on before we have - * cleared our entry. 
- */ - WT_ASSERT(session, - WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); - - /* - * Clear our entry from the global transaction session table. Any - * operation that needs to know about the ID for this checkpoint will - * consider the checkpoint ID in the global structure. Most operations - * can safely ignore the checkpoint ID (see the visible all check for - * details). - */ - txn_state->id = txn_state->pinned_id = - txn_state->metadata_pinned = WT_TXN_NONE; - __wt_writeunlock(session, txn_global->scan_rwlock); + WT_ASSERT(session, txn->isolation == WT_ISO_SNAPSHOT); /* * Unblock updates -- we can figure out that any updates to clean pages @@ -709,16 +768,6 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); - /* - * Mark old checkpoints that are being deleted and figure out which - * trees we can skip in this checkpoint. - * - * Release clean trees. Any updates made after this point will not - * visible to the checkpoint transaction. - */ - WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_mark_deletes)); - WT_ERR(__checkpoint_release_clean_trees(session)); - /* Tell logging that we have started a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( @@ -825,12 +874,13 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. 
*/ - if (ret != 0 && !conn->modified) + failed = ret != 0; + if (failed) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) - WT_TRET(__wt_meta_track_off(session, false, ret != 0)); + WT_TRET(__wt_meta_track_off(session, false, failed)); cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); @@ -863,6 +913,13 @@ err: /* for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; + /* + * If the operation failed, mark all trees dirty so they are + * included if a future checkpoint can succeed. + */ + if (failed) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + __checkpoint_fail_reset(session)); WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } @@ -1047,12 +1104,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) } /* - * __checkpoint_lock_tree -- - * Acquire the locks required to checkpoint a tree. + * __checkpoint_lock_dirty_tree -- + * Decide whether the tree needs to be included in the checkpoint and if + * so, acquire the necessary locks. */ static int -__checkpoint_lock_tree(WT_SESSION_IMPL *session, - bool is_checkpoint, bool need_tracking, const char *cfg[]) +__checkpoint_lock_dirty_tree(WT_SESSION_IMPL *session, + bool is_checkpoint, bool force, bool need_tracking, const char *cfg[]) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; @@ -1159,7 +1217,7 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. 
*/ - __wt_readlock(session, conn->hot_backup_lock); + __wt_readlock(session, &conn->hot_backup_lock); hot_backup_locked = true; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { @@ -1177,6 +1235,14 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, } /* + * Mark old checkpoints that are being deleted and figure out which + * trees we can skip in this checkpoint. + */ + WT_ERR(__checkpoint_mark_skip(session, ckptbase, force)); + if (F_ISSET(btree, WT_BTREE_SKIP_CKPT)) + goto err; + + /* * Lock the checkpoints that will be deleted. * * Checkpoints are only locked when tracking is enabled, which covers @@ -1209,64 +1275,47 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session, } /* - * There are special files: those being bulk-loaded, salvaged, upgraded - * or verified during the checkpoint. We have to do something for those - * objects because a checkpoint is an external name the application can - * reference and the name must exist no matter what's happening during - * the checkpoint. For bulk-loaded files, we could block until the load - * completes, checkpoint the partial load, or magic up an empty-file - * checkpoint. The first is too slow, the second is insane, so do the - * third. - * Salvage, upgrade and verify don't currently require any work, all - * three hold the schema lock, blocking checkpoints. If we ever want to - * fix that (and I bet we eventually will, at least for verify), we can - * copy the last checkpoint the file has. That works if we guarantee - * salvage, upgrade and verify act on objects with previous checkpoints - * (true if handles are closed/re-opened between object creation and a - * subsequent salvage, upgrade or verify operation). Presumably, - * salvage and upgrade will discard all previous checkpoints when they - * complete, which is fine with us. 
This change will require reference - * counting checkpoints, and once that's done, we should use checkpoint - * copy instead of forcing checkpoints on clean objects to associate - * names with checkpoints. + * There are special tree: those being bulk-loaded, salvaged, upgraded + * or verified during the checkpoint. They should never be part of a + * checkpoint: we will fail to lock them because the operations have + * exclusive access to the handles. Named checkpoints will fail in that + * case, ordinary checkpoints will skip files that cannot be opened + * normally. */ WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); - WT_ASSERT(session, btree->ckpt == NULL); + WT_ASSERT(session, btree->ckpt == NULL && + !F_ISSET(btree, WT_BTREE_SKIP_CKPT)); btree->ckpt = ckptbase; return (0); err: if (hot_backup_locked) - __wt_readunlock(session, conn->hot_backup_lock); + __wt_readunlock(session, &conn->hot_backup_lock); - __wt_meta_ckptlist_free(session, ckptbase); + __wt_meta_ckptlist_free(session, &ckptbase); __wt_free(session, name_alloc); return (ret); } /* - * __checkpoint_mark_deletes -- - * Figure out what old checkpoints will be deleted, and whether the - * checkpoint can be skipped entirely. + * __checkpoint_mark_skip -- + * Figure out whether the checkpoint can be skipped for a tree. */ static int -__checkpoint_mark_deletes( - WT_SESSION_IMPL *session, const char *cfg[]) +__checkpoint_mark_skip( + WT_SESSION_IMPL *session, WT_CKPT *ckptbase, bool force) { WT_BTREE *btree; - WT_CKPT *ckpt, *ckptbase; - WT_CONFIG_ITEM cval; + WT_CKPT *ckpt; const char *name; int deleted; - bool force; btree = S2BT(session); - ckptbase = btree->ckpt; /* * Check for clean objects not requiring a checkpoint. @@ -1292,12 +1341,7 @@ __checkpoint_mark_deletes( * to open the checkpoint in a cursor after taking any checkpoint, which * means it must exist. 
*/ - force = false; F_CLR(btree, WT_BTREE_SKIP_CKPT); - if (!btree->modified && cfg != NULL) { - WT_RET(__wt_config_gets(session, cfg, "force", &cval)); - force = cval.val != 0; - } if (!btree->modified && !force) { deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -1341,7 +1385,6 @@ __checkpoint_tree( WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; - int was_modified; bool fake_ckpt; WT_UNUSED(cfg); @@ -1352,7 +1395,6 @@ __checkpoint_tree( conn = S2C(session); dhandle = session->dhandle; fake_ckpt = false; - was_modified = btree->modified; /* * Set the checkpoint LSN to the maximum LSN so that if logging is @@ -1377,7 +1419,7 @@ __checkpoint_tree( * delete a physical checkpoint, and that will end in tears. */ if (is_checkpoint) - if (btree->bulk_load_ok) { + if (btree->original) { fake_ckpt = true; goto fake; } @@ -1483,14 +1525,12 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ - if (ret != 0 && !btree->modified && was_modified) { + if (ret != 0) { btree->modified = true; - if (!S2C(session)->modified) - S2C(session)->modified = true; + S2C(session)->modified = true; } - __wt_meta_ckptlist_free(session, ckptbase); - btree->ckpt = NULL; + __wt_meta_ckptlist_free(session, &btree->ckpt); return (ret); } @@ -1509,7 +1549,8 @@ __checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[]) WT_UNUSED(cfg); btree = S2BT(session); - WT_ASSERT(session, !btree->include_checkpoint_txn); + WT_ASSERT(session, btree->checkpoint_gen == + S2C(session)->txn_global.checkpoint_gen); btree->evict_walk_period = btree->evict_walk_saved; return (0); } @@ -1558,7 +1599,9 @@ __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_CONFIG_ITEM cval; WT_DECL_RET; + bool force; /* Should not be called with a checkpoint handle. 
*/ WT_ASSERT(session, session->dhandle->checkpoint == NULL); @@ -1567,12 +1610,13 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, true, true, cfg)); - WT_RET(ret); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_mark_deletes(session, cfg)); + WT_RET(__wt_config_gets_def(session, cfg, "force", 0, &cval)); + force = cval.val != 0; + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, true, force, true, cfg)); WT_RET(ret); + if (F_ISSET(S2BT(session), WT_BTREE_SKIP_CKPT)) + return (0); return (__checkpoint_tree(session, true, cfg)); } @@ -1647,15 +1691,10 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (need_tracking) WT_RET(__wt_meta_track_on(session)); - WT_SAVE_DHANDLE(session, - ret = __checkpoint_lock_tree(session, false, need_tracking, NULL)); + WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_dirty_tree( + session, false, false, need_tracking, NULL)); WT_ASSERT(session, ret == 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = __checkpoint_mark_deletes(session, NULL)); - WT_ASSERT(session, ret == 0); - } - if (ret == 0) + if (ret == 0 && !F_ISSET(btree, WT_BTREE_SKIP_CKPT)) ret = __checkpoint_tree(session, false, NULL); if (need_tracking) diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 5f4704b40c4..2931dc1ce82 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -269,7 +269,7 @@ __wt_txn_checkpoint_logread(WT_SESSION_IMPL *session, WT_ITEM ckpt_snapshot_unused; uint32_t ckpt_file, ckpt_offset; u_int ckpt_nsnapshot_unused; - const char *fmt = WT_UNCHECKED_STRING(IIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIu); if ((ret = __wt_struct_unpack(session, *pp, WT_PTRDIFF(end, *pp), fmt, &ckpt_file, &ckpt_offset, @@ -297,7 +297,7 @@ __wt_txn_checkpoint_log( uint8_t *end, *p; size_t recsize; uint32_t i, rectype = 
WT_LOGREC_CHECKPOINT; - const char *fmt = WT_UNCHECKED_STRING(IIIIU); + const char *fmt = WT_UNCHECKED_STRING(IIIIu); txn = &session->txn; ckpt_lsn = &txn->ckpt_lsn; @@ -368,14 +368,16 @@ __wt_txn_checkpoint_log( /* * If this full checkpoint completed successfully and there is - * no hot backup in progress, tell the logging subsystem the - * checkpoint LSN so that it can archive. Do not update the - * logging checkpoint LSN if this is during a clean connection - * close, only during a full checkpoint. A clean close may not - * update any metadata LSN and we do not want to archive in - * that case. + * no hot backup in progress and this is not recovery, tell + * the logging subsystem the checkpoint LSN so that it can + * archive. Do not update the logging checkpoint LSN if this + * is during a clean connection close, only during a full + * checkpoint. A clean close may not update any metadata LSN + * and we do not want to archive in that case. */ - if (!S2C(session)->hot_backup && txn->full_ckpt) + if (!S2C(session)->hot_backup && + !F_ISSET(S2C(session), WT_CONN_RECOVERING) && + txn->full_ckpt) __wt_log_ckpt(session, ckpt_lsn); /* FALLTHROUGH */ diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 65ec1a6662f..659570dbcd9 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -211,9 +211,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) if (TAILQ_EMPTY(&txn_global->nsnaph)) { WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && !__wt_txn_visible_all(session, nsnap_new->pinned_id)); - __wt_readlock(session, txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); txn_global->nsnap_oldest_id = nsnap_new->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); @@ -297,16 +297,16 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL 
*session, WT_CONFIG_ITEM *nameval) if (session->ncursors > 0) WT_RET(__wt_session_copy_values(session)); - __wt_readlock(session, txn_global->nsnap_rwlock); + __wt_readlock(session, &txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { /* * Acquire the scan lock so the oldest ID can't move * forward without seeing our pinned ID. */ - __wt_readlock(session, txn_global->scan_rwlock); + __wt_readlock(session, &txn_global->scan_rwlock); txn_state->pinned_id = nsnap->pinned_id; - __wt_readunlock(session, txn_global->scan_rwlock); + __wt_readunlock(session, &txn_global->scan_rwlock); WT_ASSERT(session, !__wt_txn_visible_all( session, txn_state->pinned_id) && @@ -327,7 +327,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) F_SET(txn, WT_TXN_HAS_SNAPSHOT); break; } - __wt_readunlock(session, txn_global->nsnap_rwlock); + __wt_readunlock(session, &txn_global->nsnap_rwlock); if (nsnap == NULL) WT_RET_MSG(session, EINVAL, diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index a6390dcbd06..30932195b1e 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -93,7 +93,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \ "/%" PRIu32, \ cursor == NULL ? "Skipping" : "Applying", \ - optype, fileid, lsnp->l.file, lsnp->l.offset); \ + optype, fileid, (lsnp)->l.file, (lsnp)->l.offset); \ if (cursor == NULL) \ break @@ -501,7 +501,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. 
*/ r.metadata_only = false; - __wt_verbose(session, WT_VERB_RECOVERY, + __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); diff --git a/src/utilities/util.h b/src/utilities/util.h index 2658d877b63..93a96d44219 100644 --- a/src/utilities/util.h +++ b/src/utilities/util.h @@ -40,7 +40,6 @@ int util_flush(WT_SESSION *, const char *); int util_list(WT_SESSION *, int, char *[]); int util_load(WT_SESSION *, int, char *[]); int util_loadtext(WT_SESSION *, int, char *[]); -char *util_name(WT_SESSION *, const char *, const char *); int util_printlog(WT_SESSION *, int, char *[]); int util_read(WT_SESSION *, int, char *[]); int util_read_line(WT_SESSION *, ULINE *, bool, bool *); @@ -49,6 +48,8 @@ int util_rename(WT_SESSION *, int, char *[]); int util_salvage(WT_SESSION *, int, char *[]); int util_stat(WT_SESSION *, int, char *[]); int util_str2recno(WT_SESSION *, const char *p, uint64_t *recnop); +int util_truncate(WT_SESSION *, int, char *[]); int util_upgrade(WT_SESSION *, int, char *[]); +char *util_uri(WT_SESSION *, const char *, const char *); int util_verify(WT_SESSION *, int, char *[]); int util_write(WT_SESSION *, int, char *[]); diff --git a/src/utilities/util_alter.c b/src/utilities/util_alter.c index d228c15cd48..ef01a1ed826 100644 --- a/src/utilities/util_alter.c +++ b/src/utilities/util_alter.c @@ -34,9 +34,12 @@ util_alter(WT_SESSION *session, int argc, char *argv[]) for (configp = argv; configp != NULL && *configp != NULL; configp += 2) if ((ret = session->alter( - session, configp[0], configp[1])) != 0) - break; - return (ret); + session, configp[0], configp[1])) != 0) { + (void)util_err(session, ret, + "session.alter: %s, %s", configp[0], configp[1]); + return (1); + } + return (0); } static int diff --git a/src/utilities/util_backup.c b/src/utilities/util_backup.c index 
5dc9671fb45..f1b31f7621a 100644 --- a/src/utilities/util_backup.c +++ b/src/utilities/util_backup.c @@ -109,9 +109,14 @@ copy(WT_SESSION *session, const char *directory, const char *name) /* Build the target pathname. */ len = strlen(directory) + strlen(name) + 2; - if ((to = malloc(len)) == NULL) - goto memerr; - (void)snprintf(to, len, "%s/%s", directory, name); + if ((to = malloc(len)) == NULL) { + fprintf(stderr, "%s: %s\n", progname, strerror(errno)); + return (1); + } + if ((ret = __wt_snprintf(to, len, "%s/%s", directory, name)) != 0) { + fprintf(stderr, "%s: %s\n", progname, strerror(ret)); + goto err; + } if (verbose && printf("Backing up %s/%s to %s\n", home, name, to) < 0) { fprintf(stderr, "%s: %s\n", progname, strerror(EIO)); @@ -126,11 +131,7 @@ copy(WT_SESSION *session, const char *directory, const char *name) fprintf(stderr, "%s/%s to %s: backup copy: %s\n", home, name, to, session->strerror(session, ret)); - if (0) { -memerr: fprintf(stderr, "%s: %s\n", progname, strerror(errno)); - } err: free(to); - return (ret); } diff --git a/src/utilities/util_compact.c b/src/utilities/util_compact.c index c114eb207fa..e469b4dce6e 100644 --- a/src/utilities/util_compact.c +++ b/src/utilities/util_compact.c @@ -30,21 +30,13 @@ util_compact(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. 
*/ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->compact(session, uri, NULL)) != 0) { - fprintf(stderr, "%s: compact(%s): %s\n", - progname, uri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->compact(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.compact: %s", uri); free(uri); - return (ret); } diff --git a/src/utilities/util_create.c b/src/utilities/util_create.c index 4e609736f2d..7c22a67792b 100644 --- a/src/utilities/util_create.c +++ b/src/utilities/util_create.c @@ -15,9 +15,9 @@ util_create(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - const char *config, *uri; + char *config, *uri; - config = NULL; + config = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:")) != EOF) switch (ch) { case 'c': /* command-line configuration */ @@ -35,12 +35,14 @@ util_create(WT_SESSION *session, int argc, char *argv[]) if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); if ((ret = session->create(session, uri, config)) != 0) - return (util_err(session, ret, "%s: session.create", uri)); - return (0); + (void)util_err(session, ret, "session.create: %s", uri); + + free(uri); + return (ret); } static int diff --git a/src/utilities/util_drop.c b/src/utilities/util_drop.c index ba41445dfb6..456005d445d 100644 --- a/src/utilities/util_drop.c +++ b/src/utilities/util_drop.c @@ -15,8 +15,9 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,12 +31,13 @@ util_drop(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the uri. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - ret = session->drop(session, name, "force"); + if ((ret = session->drop(session, uri, "force")) != 0) + (void)util_err(session, ret, "session.drop: %s", uri); - free(name); + free(uri); return (ret); } diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 7dde13ee837..955148b7d46 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -6,10 +6,14 @@ * See the file LICENSE for redistribution information. */ +#include <assert.h> #include "util.h" #include "util_dump.h" -static int dump_config(WT_SESSION *, const char *, bool, bool); +#define STRING_MATCH_CONFIG(s, item) \ + (strncmp(s, (item).str, (item).len) == 0 && (s)[(item).len] == '\0') + +static int dump_config(WT_SESSION *, const char *, WT_CURSOR *, bool, bool); static int dump_json_begin(WT_SESSION *); static int dump_json_end(WT_SESSION *); static int dump_json_separator(WT_SESSION *); @@ -17,7 +21,8 @@ static int dump_json_table_end(WT_SESSION *); static int dump_prefix(WT_SESSION *, bool, bool); static int dump_record(WT_CURSOR *, bool, bool); static int dump_suffix(WT_SESSION *, bool); -static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *, bool); +static int dump_table_config( + WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, bool); static int dump_table_parts_config( WT_SESSION *, WT_CURSOR *, const char *, const char *, bool); static int dup_json_string(const char *, char **); @@ -32,10 +37,11 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) size_t len; int ch, i; bool hex, json, reverse; - char *checkpoint, *config, *name; + char *checkpoint, *config, *p, *simpleuri, *uri; hex = json = reverse = false; - checkpoint = config = name = NULL; + checkpoint = config = simpleuri = uri = NULL; + cursor = NULL; while ((ch = __wt_getopt(progname, argc, argv, "c:f:jrx")) != EOF) 
switch (ch) { case 'c': @@ -75,21 +81,19 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) return (usage()); if (json && - ((ret = dump_json_begin(session)) != 0 || - (ret = dump_prefix(session, hex, json)) != 0)) + (dump_json_begin(session) != 0 || + dump_prefix(session, hex, json) != 0)) goto err; for (i = 0; i < argc; i++) { if (json && i > 0) - if ((ret = dump_json_separator(session)) != 0) + if (dump_json_separator(session) != 0) goto err; - free(name); - name = NULL; - - if ((name = util_name(session, argv[i], "table")) == NULL) - goto err; + free(uri); + free(simpleuri); + uri = simpleuri = NULL; - if (dump_config(session, name, hex, json) != 0) + if ((uri = util_uri(session, argv[i], "table")) == NULL) goto err; len = @@ -109,18 +113,34 @@ util_dump(WT_SESSION *session, int argc, char *argv[]) (void)strcat(config, json ? "dump=json" : (hex ? "dump=hex" : "dump=print")); if ((ret = session->open_cursor( - session, name, NULL, config, &cursor)) != 0) { + session, uri, NULL, config, &cursor)) != 0) { fprintf(stderr, "%s: cursor open(%s) failed: %s\n", - progname, name, session->strerror(session, ret)); + progname, uri, session->strerror(session, ret)); + goto err; + } + + if ((simpleuri = strdup(uri)) == NULL) { + (void)util_err(session, errno, NULL); goto err; } + if ((p = strchr(simpleuri, '(')) != NULL) + *p = '\0'; + if (dump_config(session, simpleuri, cursor, hex, json) != 0) + goto err; - if ((ret = dump_record(cursor, reverse, json)) != 0) + if (dump_record(cursor, reverse, json) != 0) goto err; - if (json && (ret = dump_json_table_end(session)) != 0) + if (json && dump_json_table_end(session) != 0) goto err; + + ret = cursor->close(cursor); + cursor = NULL; + if (ret != 0) { + (void)util_err(session, ret, NULL); + goto err; + } } - if (json && ((ret = dump_json_end(session)) != 0)) + if (json && dump_json_end(session) != 0) goto err; if (0) { @@ -128,8 +148,12 @@ err: ret = 1; } free(config); - free(name); - + free(uri); + free(simpleuri); + if 
(cursor != NULL && (ret = cursor->close(cursor)) != 0) { + (void)util_err(session, ret, NULL); + ret = 1; + } return (ret); } @@ -138,15 +162,16 @@ err: ret = 1; * Dump the config for the uri. */ static int -dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) +dump_config(WT_SESSION *session, const char *uri, WT_CURSOR *cursor, bool hex, + bool json) { - WT_CURSOR *cursor; + WT_CURSOR *mcursor; WT_DECL_RET; int tret; /* Open a metadata cursor. */ if ((ret = session->open_cursor( - session, "metadata:create", NULL, NULL, &cursor)) != 0) { + session, "metadata:create", NULL, NULL, &mcursor)) != 0) { fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, "metadata:create", session->strerror(session, ret)); return (1); @@ -156,10 +181,11 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) * want to output a header if the user entered the wrong name. This is * where we find out a table doesn't exist, use a simple error message. */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) == 0) { + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) == 0) { if ((!json && dump_prefix(session, hex, json) != 0) || - dump_table_config(session, cursor, uri, json) != 0 || + dump_table_config(session, mcursor, cursor, + uri, json) != 0 || dump_suffix(session, json) != 0) ret = 1; } else if (ret == WT_NOTFOUND) @@ -167,8 +193,8 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json) else ret = util_err(session, ret, "%s", uri); - if ((tret = cursor->close(cursor)) != 0) { - tret = util_cerr(cursor, "close", tret); + if ((tret = mcursor->close(mcursor)) != 0) { + tret = util_cerr(mcursor, "close", tret); if (ret == 0) ret = tret; } @@ -225,16 +251,126 @@ dump_json_table_end(WT_SESSION *session) } /* + * dump_add_config + * Add a formatted config string to an output buffer. + */ +static int +dump_add_config(WT_SESSION *session, char **bufp, size_t *leftp, + const char *fmt, ...) 
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 4, 5))) +{ + WT_DECL_RET; + size_t n; + va_list ap; + + va_start(ap, fmt); + ret = __wt_vsnprintf_len_set(*bufp, *leftp, &n, fmt, ap); + va_end(ap); + if (ret != 0) + return (util_err(session, ret, NULL)); + *bufp += n; + *leftp -= (size_t)n; + return (0); +} + +/* + * dump_projection -- + * Create a new config containing projection information. + */ +static int +dump_projection(WT_SESSION *session, const char *config, WT_CURSOR *cursor, + char **newconfigp) +{ + WT_DECL_RET; + WT_CONFIG_ITEM key, value; + WT_CONFIG_PARSER *parser; + WT_EXTENSION_API *wt_api; + size_t len, vallen; + int nkeys; + char *newconfig; + const char *keyformat, *p; + + len = strlen(config) + strlen(cursor->value_format) + + strlen(cursor->uri) + 20; + if ((newconfig = malloc(len)) == NULL) + return util_err(session, errno, NULL); + *newconfigp = newconfig; + wt_api = session->connection->get_extension_api(session->connection); + if ((ret = wt_api->config_parser_open(wt_api, session, config, + strlen(config), &parser)) != 0) + return (util_err( + session, ret, "WT_EXTENSION_API.config_parser_open")); + keyformat = cursor->key_format; + for (nkeys = 0; *keyformat; keyformat++) + if (!__wt_isdigit((u_char)*keyformat)) + nkeys++; + + /* + * Copy the configuration, replacing some fields to match the + * projection. 
+ */ + while ((ret = parser->next(parser, &key, &value)) == 0) { + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s=", (int)key.len, key.str)); + if (STRING_MATCH_CONFIG("value_format", key)) + WT_RET(dump_add_config(session, &newconfig, &len, + "%s", cursor->value_format)); + else if (STRING_MATCH_CONFIG("columns", key)) { + /* copy names of keys */ + p = value.str; + vallen = value.len; + while (vallen > 0) { + if ((*p == ',' || *p == ')') && --nkeys == 0) + break; + p++; + vallen--; + } + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s", (int)(p - value.str), value.str)); + + /* copy names of projected values */ + p = strchr(cursor->uri, '('); + assert(p != NULL); + assert(p[strlen(p) - 1] == ')'); + p++; + if (*p != ')') + WT_RET(dump_add_config(session, &newconfig, + &len, "%s", ",")); + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s),", (int)(strlen(p) - 1), p)); + } else if (value.type == WT_CONFIG_ITEM_STRING && + value.len != 0) + WT_RET(dump_add_config(session, &newconfig, &len, + "\"%.*s\",", (int)value.len, value.str)); + else + WT_RET(dump_add_config(session, &newconfig, &len, + "%.*s,", (int)value.len, value.str)); + } + if (ret != WT_NOTFOUND) + return (util_err(session, ret, "WT_CONFIG_PARSER.next")); + + assert(len > 0); + if ((ret = parser->close(parser)) != 0) + return (util_err( + session, ret, "WT_CONFIG_PARSER.close")); + + return (0); +} + +/* * dump_table_config -- * Dump the config for a table. */ static int dump_table_config( - WT_SESSION *session, WT_CURSOR *cursor, const char *uri, bool json) + WT_SESSION *session, WT_CURSOR *mcursor, WT_CURSOR *cursor, + const char *uri, bool json) { WT_DECL_RET; + char *proj_config; const char *name, *v; + proj_config = NULL; /* Get the table name. 
*/ if ((name = strchr(uri, ':')) == NULL) { fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); @@ -246,20 +382,25 @@ dump_table_config( * Dump out the config information: first, dump the uri entry itself, * it overrides all subsequent configurations. */ - cursor->set_key(cursor, uri); - if ((ret = cursor->search(cursor)) != 0) - return (util_cerr(cursor, "search", ret)); - if ((ret = cursor->get_value(cursor, &v)) != 0) - return (util_cerr(cursor, "get_value", ret)); - - WT_RET(print_config(session, uri, v, json, true)); + mcursor->set_key(mcursor, uri); + if ((ret = mcursor->search(mcursor)) != 0) + return (util_cerr(mcursor, "search", ret)); + if ((ret = mcursor->get_value(mcursor, &v)) != 0) + return (util_cerr(mcursor, "get_value", ret)); + + if (strchr(cursor->uri, '(') != NULL) { + WT_ERR(dump_projection(session, v, cursor, &proj_config)); + v = proj_config; + } + WT_ERR(print_config(session, uri, v, json, true)); - WT_RET(dump_table_parts_config( - session, cursor, name, "colgroup:", json)); - WT_RET(dump_table_parts_config( - session, cursor, name, "index:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "colgroup:", json)); + WT_ERR(dump_table_parts_config( + session, mcursor, name, "index:", json)); - return (0); +err: free(proj_config); + return (ret); } /* @@ -295,9 +436,11 @@ dump_table_parts_config(WT_SESSION *session, WT_CURSOR *cursor, len = strlen(entry) + strlen(name) + 1; if ((uriprefix = malloc(len)) == NULL) - return util_err(session, errno, NULL); - - snprintf(uriprefix, len, "%s%s", entry, name); + return (util_err(session, errno, NULL)); + if ((ret = __wt_snprintf(uriprefix, len, "%s%s", entry, name)) != 0) { + free(uriprefix); + return (util_err(session, ret, NULL)); + } /* * Search the file looking for column group and index key/value pairs: @@ -364,17 +507,18 @@ dump_prefix(WT_SESSION *session, bool hex, bool json) (void)wiredtiger_version(&vmajor, &vminor, &vpatch); + if (json && printf( + " \"%s\" : \"%d 
(%d.%d.%d)\",\n", + DUMP_JSON_VERSION_MARKER, DUMP_JSON_CURRENT_VERSION, + vmajor, vminor, vpatch) < 0) + return (util_err(session, EIO, NULL)); + if (!json && (printf( "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n", vmajor, vminor, vpatch) < 0 || printf("Format=%s\n", hex ? "hex" : "print") < 0 || printf("Header\n") < 0)) return (util_err(session, EIO, NULL)); - else if (json && printf( - " \"%s\" : \"%d (%d.%d.%d)\",\n", - DUMP_JSON_VERSION_MARKER, DUMP_JSON_CURRENT_VERSION, - vmajor, vminor, vpatch) < 0) - return (util_err(session, EIO, NULL)); return (0); } diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index e91dbfce05b..f19ba4d1f97 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -19,10 +19,10 @@ util_list(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; bool cflag, vflag; - char *name; + char *uri; cflag = vflag = false; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "cv")) != EOF) switch (ch) { case 'c': @@ -42,17 +42,16 @@ util_list(WT_SESSION *session, int argc, char *argv[]) case 0: break; case 1: - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); break; default: return (usage()); } - ret = list_print(session, name, cflag, vflag); - - free(name); + ret = list_print(session, uri, cflag, vflag); + free(uri); return (ret); } @@ -99,7 +98,7 @@ list_get_allocsize(WT_SESSION *session, const char *key, size_t *allocsize) * List the high-level objects in the database. 
*/ static int -list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) +list_print(WT_SESSION *session, const char *uri, bool cflag, bool vflag) { WT_CURSOR *cursor; WT_DECL_RET; @@ -120,7 +119,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) ret, "%s: WT_SESSION.open_cursor", WT_METADATA_URI)); } - found = name == NULL; + found = uri == NULL; while ((ret = cursor->next(cursor)) == 0) { /* Get the key. */ if ((ret = cursor->get_key(cursor, &key)) != 0) @@ -129,8 +128,8 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) /* * If a name is specified, only show objects that match. */ - if (name != NULL) { - if (!WT_PREFIX_MATCH(key, name)) + if (uri != NULL) { + if (!WT_PREFIX_MATCH(key, uri)) continue; found = true; } @@ -161,7 +160,7 @@ list_print(WT_SESSION *session, const char *name, bool cflag, bool vflag) if (ret != WT_NOTFOUND) return (util_cerr(cursor, "next", ret)); if (!found) { - fprintf(stderr, "%s: %s: not found\n", progname, name); + fprintf(stderr, "%s: %s: not found\n", progname, uri); return (1); } diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index ac18df80851..d2f00402217 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -80,8 +80,8 @@ util_load(WT_SESSION *session, int argc, char *argv[]) if (no_overwrite) flags |= LOAD_JSON_NO_OVERWRITE; return (util_load_json(session, filename, flags)); - } else - return (load_dump(session)); + } + return (load_dump(session)); } /* @@ -120,13 +120,15 @@ load_dump(WT_SESSION *session) goto err; /* Open the insert cursor. */ - (void)snprintf(config, sizeof(config), + if ((ret = __wt_snprintf(config, sizeof(config), "dump=%s%s%s", hex ? "hex" : "print", - append ? ",append" : "", no_overwrite ? ",overwrite=false" : ""); + append ? ",append" : "", + no_overwrite ? 
",overwrite=false" : "")) != 0) + return (util_err(session, ret, NULL)); if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } @@ -472,6 +474,7 @@ config_update(WT_SESSION *session, char **list) static int config_rename(WT_SESSION *session, char **urip, const char *name) { + WT_DECL_RET; size_t len; char *buf, *p; @@ -490,7 +493,9 @@ config_rename(WT_SESSION *session, char **urip, const char *name) } *p = '\0'; p = strchr(p + 1, ':'); - snprintf(buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p); + if ((ret = __wt_snprintf( + buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p)) != 0) + return (util_err(session, ret, NULL)); *urip = buf; return (0); diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c index 020a4ed9ba9..c693e2b7651 100644 --- a/src/utilities/util_load_json.c +++ b/src/utilities/util_load_json.c @@ -145,6 +145,7 @@ static int json_kvraw_append(WT_SESSION *session, JSON_INPUT_STATE *ins, const char *str, size_t len) { + WT_DECL_RET; size_t needsize; char *tmp; @@ -152,11 +153,15 @@ json_kvraw_append(WT_SESSION *session, needsize = strlen(ins->kvraw) + len + 2; if ((tmp = malloc(needsize)) == NULL) return (util_err(session, errno, NULL)); - snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str); + WT_ERR(__wt_snprintf( + tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str)); free(ins->kvraw); ins->kvraw = tmp; } return (0); + +err: free(tmp); + return (util_err(session, ret, NULL)); } /* @@ -181,7 +186,7 @@ json_strdup(WT_SESSION *session, JSON_INPUT_STATE *ins, char **resultp) goto err; } resultlen += 1; - if ((result = (char *)malloc((size_t)resultlen)) == NULL) { + if ((result = malloc((size_t)resultlen)) == NULL) { ret = util_err(session, errno, NULL); goto err; } @@ -236,13 +241,16 @@ json_data(WT_SESSION *session, goto err; uri = clp->list[0]; - 
(void)snprintf(config, sizeof(config), + if ((ret = __wt_snprintf(config, sizeof(config), "dump=json%s%s", LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "", - LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : ""); + LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : "")) != 0) { + ret = util_err(session, ret, NULL); + goto err; + } if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { - ret = util_err(session, ret, "%s: session.open", uri); + ret = util_err(session, ret, "%s: session.open_cursor", uri); goto err; } keyformat = cursor->key_format; @@ -256,7 +264,7 @@ json_data(WT_SESSION *session, nfield = 0; JSON_EXPECT(session, ins, '{'); if (ins->kvraw == NULL) { - if ((ins->kvraw = (char *)malloc(1)) == NULL) { + if ((ins->kvraw = malloc(1)) == NULL) { ret = util_err(session, errno, NULL); goto err; } @@ -358,8 +366,11 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags) while (json_peek(session, ins) == 's') { JSON_EXPECT(session, ins, 's'); tableuri = realloc(tableuri, ins->toklen); - snprintf(tableuri, ins->toklen, "%.*s", - (int)(ins->toklen - 2), ins->tokstart + 1); + if ((ret = __wt_snprintf(tableuri, ins->toklen, + "%.*s", (int)(ins->toklen - 2), ins->tokstart + 1)) != 0) { + ret = util_err(session, ret, NULL); + goto err; + } JSON_EXPECT(session, ins, ':'); if (!hasversion) { if (strcmp(tableuri, DUMP_JSON_VERSION_MARKER) != 0) { diff --git a/src/utilities/util_loadtext.c b/src/utilities/util_loadtext.c index f9c5b6e9a1f..7602d43f8c9 100644 --- a/src/utilities/util_loadtext.c +++ b/src/utilities/util_loadtext.c @@ -15,9 +15,11 @@ static int usage(void); int util_loadtext(WT_SESSION *session, int argc, char *argv[]) { + WT_DECL_RET; int ch; - const char *uri; + char *uri; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "f:")) != EOF) switch (ch) { case 'f': /* input file */ @@ -35,10 +37,13 @@ util_loadtext(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the 
uri. */ if (argc != 1) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - return (text(session, uri)); + ret = text(session, uri); + + free(uri); + return (ret); } /* @@ -61,7 +66,7 @@ text(WT_SESSION *session, const char *uri) */ if ((ret = session->open_cursor( session, uri, NULL, "append,overwrite", &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + return (util_err(session, ret, "%s: session.open_cursor", uri)); /* * We're about to load strings, make sure the formats match. diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c index 1da56adf137..c6f225bb667 100644 --- a/src/utilities/util_main.c +++ b/src/utilities/util_main.c @@ -20,7 +20,43 @@ static const char *command; /* Command name */ #define REC_LOGOFF "log=(enabled=false)" #define REC_RECOVER "log=(recover=on)" -static int usage(void); +static void +usage(void) +{ + fprintf(stderr, + "WiredTiger Data Engine (version %d.%d)\n", + WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); + fprintf(stderr, + "global options:\n" + "\t" "-C\t" "wiredtiger_open configuration\n" + "\t" "-h\t" "database directory\n" + "\t" "-L\t" "turn logging off for debug-mode\n" + "\t" "-R\t" "run recovery if configured\n" + "\t" "-V\t" "display library version and exit\n" + "\t" "-v\t" "verbose\n"); + fprintf(stderr, + "commands:\n" + "\t" "alter\t alter an object\n" + "\t" "backup\t database backup\n" + "\t" "compact\t compact an object\n" + "\t" "copyright copyright information\n" + "\t" "create\t create an object\n" + "\t" "drop\t drop an object\n" + "\t" "dump\t dump an object\n" + "\t" "list\t list database objects\n" + "\t" "load\t load an object\n" + "\t" "loadtext load an object from a text file\n" + "\t" "printlog display the database log\n" + "\t" "read\t read values from an object\n" + "\t" "rebalance rebalance an object\n" + "\t" "rename\t rename an object\n" + "\t" "salvage\t 
salvage a file\n" + "\t" "stat\t display statistics for an object\n" + "\t" "truncate truncate an object, removing all content\n" + "\t" "upgrade\t upgrade an object\n" + "\t" "verify\t verify an object\n" + "\t" "write\t write values to an object\n"); +} int main(int argc, char *argv[]) @@ -73,8 +109,9 @@ main(int argc, char *argv[]) cmd_config = __wt_optarg; break; case 'E': /* secret key */ + free(secretkey); /* lint: set more than once */ if ((secretkey = strdup(__wt_optarg)) == NULL) { - ret = util_err(NULL, errno, NULL); + (void)util_err(NULL, errno, NULL); goto err; } memset(__wt_optarg, 0, strlen(__wt_optarg)); @@ -92,24 +129,27 @@ main(int argc, char *argv[]) break; case 'V': /* version */ printf("%s\n", wiredtiger_version(NULL, NULL, NULL)); - return (EXIT_SUCCESS); + goto done; case 'v': /* verbose */ verbose = true; break; case '?': default: - return (usage()); + usage(); + goto err; } if (logoff && recover) { fprintf(stderr, "Only one of -L and -R is allowed.\n"); - return (EXIT_FAILURE); + goto err; } argc -= __wt_optind; argv += __wt_optind; /* The next argument is the command name. */ - if (argc < 1) - return (usage()); + if (argc < 1) { + usage(); + goto err; + } command = argv[0]; /* Reset getopt. */ @@ -130,7 +170,7 @@ main(int argc, char *argv[]) func = util_compact; else if (strcmp(command, "copyright") == 0) { util_copyright(); - return (EXIT_SUCCESS); + goto done; } else if (strcmp(command, "create") == 0) { func = util_create; config = "create"; @@ -175,6 +215,10 @@ main(int argc, char *argv[]) config = "statistics=(all)"; } break; + case 't' : + if (strcmp(command, "truncate") == 0) + func = util_truncate; + break; case 'u': if (strcmp(command, "upgrade") == 0) func = util_upgrade; @@ -190,8 +234,10 @@ main(int argc, char *argv[]) default: break; } - if (func == NULL) - return (usage()); + if (func == NULL) { + usage(); + goto err; + } /* Build the configuration string. 
*/ len = 10; /* some slop */ @@ -208,30 +254,39 @@ main(int argc, char *argv[]) } len += strlen(rec_config); if ((p = malloc(len)) == NULL) { - ret = util_err(NULL, errno, NULL); + (void)util_err(NULL, errno, NULL); goto err; } - (void)snprintf(p, len, "%s,%s,%s%s%s%s", + if ((ret = __wt_snprintf(p, len, "%s,%s,%s%s%s%s", config == NULL ? "" : config, - cmd_config == NULL ? "" : cmd_config, rec_config, p1, p2, p3); + cmd_config == NULL ? "" : cmd_config, + rec_config, p1, p2, p3)) != 0) { + (void)util_err(NULL, ret, NULL); + goto err; + } config = p; /* Open the database and a session. */ if ((ret = wiredtiger_open(home, verbose ? verbose_handler : NULL, config, &conn)) != 0) { - ret = util_err(NULL, ret, NULL); + (void)util_err(NULL, ret, NULL); goto err; } if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) { - ret = util_err(NULL, ret, NULL); + (void)util_err(NULL, ret, NULL); goto err; } /* Call the function. */ ret = func(session, argc, argv); + if (0) { +err: ret = 1; + } +done: + /* Close the database. */ -err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) + if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) ret = tret; free(p); @@ -240,52 +295,14 @@ err: if (conn != NULL && (tret = conn->close(conn, NULL)) != 0 && ret == 0) return (ret == 0 ? 
EXIT_SUCCESS : EXIT_FAILURE); } -static int -usage(void) -{ - fprintf(stderr, - "WiredTiger Data Engine (version %d.%d)\n", - WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR); - fprintf(stderr, - "global options:\n" - "\t" "-C\t" "wiredtiger_open configuration\n" - "\t" "-h\t" "database directory\n" - "\t" "-L\t" "turn logging off for debug-mode\n" - "\t" "-R\t" "run recovery if configured\n" - "\t" "-V\t" "display library version and exit\n" - "\t" "-v\t" "verbose\n"); - fprintf(stderr, - "commands:\n" - "\t" "alter\t alter an object\n" - "\t" "backup\t database backup\n" - "\t" "compact\t compact an object\n" - "\t" "copyright copyright information\n" - "\t" "create\t create an object\n" - "\t" "drop\t drop an object\n" - "\t" "dump\t dump an object\n" - "\t" "list\t list database objects\n" - "\t" "load\t load an object\n" - "\t" "loadtext load an object from a text file\n" - "\t" "printlog display the database log\n" - "\t" "read\t read values from an object\n" - "\t" "rebalance rebalance an object\n" - "\t" "rename\t rename an object\n" - "\t" "salvage\t salvage a file\n" - "\t" "stat\t display statistics for an object\n" - "\t" "upgrade\t upgrade an object\n" - "\t" "verify\t verify an object\n" - "\t" "write\t write values to an object\n"); - - return (EXIT_FAILURE); -} - /* - * util_name -- + * util_uri -- * Build a name. */ char * -util_name(WT_SESSION *session, const char *s, const char *type) +util_uri(WT_SESSION *session, const char *s, const char *type) { + WT_DECL_RET; size_t len; char *name; @@ -309,8 +326,12 @@ util_name(WT_SESSION *session, const char *s, const char *type) * the default type for the operation. 
*/ if (strchr(s, ':') != NULL) - strcpy(name, s); + WT_ERR(__wt_snprintf(name, len, "%s", s)); else - snprintf(name, len, "%s:%s", type, s); + WT_ERR(__wt_snprintf(name, len, "%s:%s", type, s)); return (name); + +err: free(name); + (void)util_err(session, ret, NULL); + return (NULL); } diff --git a/src/utilities/util_misc.c b/src/utilities/util_misc.c index 0905bfa97be..e26185a0096 100644 --- a/src/utilities/util_misc.c +++ b/src/utilities/util_misc.c @@ -140,7 +140,10 @@ util_flush(WT_SESSION *session, const char *uri) if ((buf = malloc(len)) == NULL) return (util_err(session, errno, NULL)); - (void)snprintf(buf, len, "target=(\"%s\")", uri); + if ((ret = __wt_snprintf(buf, len, "target=(\"%s\")", uri)) != 0) { + free(buf); + return (util_err(session, ret, NULL)); + } ret = session->checkpoint(session, buf); free(buf); diff --git a/src/utilities/util_printlog.c b/src/utilities/util_printlog.c index e7fa2134934..5f3ed43905b 100644 --- a/src/utilities/util_printlog.c +++ b/src/utilities/util_printlog.c @@ -14,8 +14,8 @@ int util_printlog(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; - int ch; uint32_t flags; + int ch; flags = 0; while ((ch = __wt_getopt(progname, argc, argv, "f:x")) != EOF) @@ -41,17 +41,9 @@ util_printlog(WT_SESSION *session, int argc, char *argv[]) if (argc != 0) return (usage()); - ret = __wt_txn_printlog(session, flags); - - if (ret != 0) { - fprintf(stderr, "%s: printlog failed: %s\n", - progname, session->strerror(session, ret)); - goto err; - } + if ((ret = __wt_txn_printlog(session, flags)) != 0) + (void)util_err(session, ret, "printlog"); - if (0) { -err: ret = 1; - } return (ret); } diff --git a/src/utilities/util_read.c b/src/utilities/util_read.c index 2e766377aa9..393949b6a1c 100644 --- a/src/utilities/util_read.c +++ b/src/utilities/util_read.c @@ -18,8 +18,9 @@ util_read(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool rkey, rval; - const char *uri, *value; + char *uri, *value; + uri = NULL; 
while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -32,13 +33,19 @@ util_read(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are a uri followed by a list of keys. */ if (argc < 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ - if ((ret = session->open_cursor( - session, uri, NULL, NULL, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ + if ((ret = + session->open_cursor(session, uri, NULL, NULL, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a diff --git a/src/utilities/util_rebalance.c b/src/utilities/util_rebalance.c index 45f161487e5..c188ea17d22 100644 --- a/src/utilities/util_rebalance.c +++ b/src/utilities/util_rebalance.c @@ -15,9 +15,9 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_rebalance(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. 
*/ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->rebalance(session, name, NULL)) != 0) { - fprintf(stderr, "%s: rebalance(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->rebalance(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.rebalance: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_rename.c b/src/utilities/util_rename.c index aee299c6e63..bb2d40cd103 100644 --- a/src/utilities/util_rename.c +++ b/src/utilities/util_rename.c @@ -30,22 +30,15 @@ util_rename(WT_SESSION *session, int argc, char *argv[]) /* The remaining arguments are the object uri and new name. 
*/ if (argc != 2) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); newuri = argv[1]; - if ((ret = session->rename(session, uri, newuri, NULL)) != 0) { - fprintf(stderr, "%s: rename %s to %s: %s\n", - progname, uri, newuri, session->strerror(session, ret)); - goto err; - } - - if (0) { -err: ret = 1; - } + if ((ret = session->rename(session, uri, newuri, NULL)) != 0) + (void)util_err( + session, ret, "session.rename: %s, %s", uri, newuri); free(uri); - return (ret); } diff --git a/src/utilities/util_salvage.c b/src/utilities/util_salvage.c index 679d1074457..6cc2278b846 100644 --- a/src/utilities/util_salvage.c +++ b/src/utilities/util_salvage.c @@ -16,10 +16,10 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) WT_DECL_RET; int ch; const char *force; - char *name; + char *uri; force = NULL; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "F")) != EOF) switch (ch) { case 'F': @@ -35,25 +35,21 @@ util_salvage(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the file name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "file")) == NULL) + if ((uri = util_uri(session, *argv, "file")) == NULL) return (1); - if ((ret = session->salvage(session, name, force)) != 0) { - fprintf(stderr, "%s: salvage(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->salvage(session, uri, force)) != 0) + (void)util_err(session, ret, "session.salvage: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_stat.c b/src/utilities/util_stat.c index 4376f559ceb..0692afe2819 100644 --- a/src/utilities/util_stat.c +++ b/src/utilities/util_stat.c @@ -55,7 +55,7 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) objname = (char *)""; break; case 1: - if ((objname = util_name(session, *argv, "table")) == NULL) + if ((objname = util_uri(session, *argv, "table")) == NULL) return (1); objname_free = true; break; @@ -68,7 +68,10 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) fprintf(stderr, "%s: %s\n", progname, strerror(errno)); goto err; } - snprintf(uri, urilen, "statistics:%s", objname); + if ((ret = __wt_snprintf(uri, urilen, "statistics:%s", objname)) != 0) { + fprintf(stderr, "%s: %s\n", progname, strerror(ret)); + goto err; + } if ((ret = session->open_cursor(session, uri, NULL, config, &cursor)) != 0) { @@ -82,8 +85,8 @@ util_stat(WT_SESSION *session, int argc, char *argv[]) (ret = cursor->next(cursor)) == 0 && (ret = cursor->get_value(cursor, &desc, &pval, NULL)) == 0) if (printf("%s=%s\n", desc, pval) < 0) { - ret = errno; - break; + (void)util_err(session, errno, "printf"); + goto err; } if (ret == WT_NOTFOUND) ret = 0; diff --git a/src/utilities/util_truncate.c b/src/utilities/util_truncate.c new file mode 100644 index 00000000000..35de02345c8 --- /dev/null +++ b/src/utilities/util_truncate.c @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2014-2016 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. 
+ */ + +#include "util.h" + +static int usage(void); + +int +util_truncate(WT_SESSION *session, int argc, char *argv[]) +{ + WT_DECL_RET; + int ch; + char *uri; + + uri = NULL; + while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) + switch (ch) { + case '?': + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + + /* The remaining argument is the uri. */ + if (argc != 1) + return (usage()); + if ((uri = util_uri(session, *argv, "table")) == NULL) + return (1); + + if ((ret = session->truncate(session, uri, NULL, NULL, NULL)) != 0) + (void)util_err(session, ret, "session.truncate: %s", uri); + + free(uri); + return (ret); +} + +static int +usage(void) +{ + (void)fprintf(stderr, + "usage: %s %s " + "truncate uri\n", + progname, usage_prefix); + return (1); +} diff --git a/src/utilities/util_upgrade.c b/src/utilities/util_upgrade.c index 63b23f28c16..f89bd46e133 100644 --- a/src/utilities/util_upgrade.c +++ b/src/utilities/util_upgrade.c @@ -15,9 +15,9 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) { WT_DECL_RET; int ch; - char *name; + char *uri; - name = NULL; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "")) != EOF) switch (ch) { case '?': @@ -30,25 +30,21 @@ util_upgrade(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - if ((ret = session->upgrade(session, name, NULL)) != 0) { - fprintf(stderr, "%s: upgrade(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + if ((ret = session->upgrade(session, uri, NULL)) != 0) + (void)util_err(session, ret, "session.upgrade: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - /* Verbose configures a progress counter, move to the next line. 
*/ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; - } - - free(name); - + free(uri); return (ret); } diff --git a/src/utilities/util_verify.c b/src/utilities/util_verify.c index 82bdd780cd3..ace1be7a5de 100644 --- a/src/utilities/util_verify.c +++ b/src/utilities/util_verify.c @@ -17,10 +17,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) size_t size; int ch; bool dump_address, dump_blocks, dump_layout, dump_pages; - char *config, *dump_offsets, *name; + char *config, *dump_offsets, *uri; dump_address = dump_blocks = dump_layout = dump_pages = false; - config = dump_offsets = name = NULL; + config = dump_offsets = uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "d:")) != EOF) switch (ch) { case 'd': @@ -55,7 +55,7 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) /* The remaining argument is the table name. */ if (argc != 1) return (usage()); - if ((name = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); /* Build the configuration string as necessary. */ @@ -69,10 +69,10 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) strlen("dump_offsets[],") + (dump_offsets == NULL ? 0 : strlen(dump_offsets)) + 20; if ((config = malloc(size)) == NULL) { - (void)util_err(session, errno, NULL); + ret = util_err(session, errno, NULL); goto err; } - snprintf(config, size, + if ((ret = __wt_snprintf(config, size, "%s%s%s%s%s%s%s", dump_address ? "dump_address," : "", dump_blocks ? "dump_blocks," : "", @@ -80,25 +80,24 @@ util_verify(WT_SESSION *session, int argc, char *argv[]) dump_offsets != NULL ? "dump_offsets=[" : "", dump_offsets != NULL ? dump_offsets : "", dump_offsets != NULL ? "]," : "", - dump_pages ? "dump_pages," : ""); - } - if ((ret = session->verify(session, name, config)) != 0) { - fprintf(stderr, "%s: verify(%s): %s\n", - progname, name, session->strerror(session, ret)); - goto err; + dump_pages ? 
"dump_pages," : "")) != 0) { + (void)util_err(session, ret, NULL); + goto err; + } } - - /* Verbose configures a progress counter, move to the next line. */ - if (verbose) - printf("\n"); - - if (0) { -err: ret = 1; + if ((ret = session->verify(session, uri, config)) != 0) + (void)util_err(session, ret, "session.verify: %s", uri); + else { + /* + * Verbose configures a progress counter, move to the next + * line. + */ + if (verbose) + printf("\n"); } - free(config); - free(name); - +err: free(config); + free(uri); return (ret); } diff --git a/src/utilities/util_write.c b/src/utilities/util_write.c index 7d9bce02b36..1d3e6937f8d 100644 --- a/src/utilities/util_write.c +++ b/src/utilities/util_write.c @@ -18,10 +18,10 @@ util_write(WT_SESSION *session, int argc, char *argv[]) uint64_t recno; int ch; bool append, overwrite, rkey; - const char *uri; - char config[100]; + char *uri, config[100]; append = overwrite = false; + uri = NULL; while ((ch = __wt_getopt(progname, argc, argv, "ao")) != EOF) switch (ch) { case 'a': @@ -47,15 +47,25 @@ util_write(WT_SESSION *session, int argc, char *argv[]) } else if (argc < 3 || ((argc - 1) % 2 != 0)) return (usage()); - if ((uri = util_name(session, *argv, "table")) == NULL) + if ((uri = util_uri(session, *argv, "table")) == NULL) return (1); - /* Open the object. */ - (void)snprintf(config, sizeof(config), "%s,%s", - append ? "append=true" : "", overwrite ? "overwrite=true" : ""); - if ((ret = session->open_cursor( - session, uri, NULL, config, &cursor)) != 0) - return (util_err(session, ret, "%s: session.open", uri)); + /* + * Open the object; free allocated memory immediately to simplify + * future error handling. + */ + if ((ret = __wt_snprintf(config, sizeof(config), "%s,%s", + append ? "append=true" : "", + overwrite ? 
"overwrite=true" : "")) != 0) { + free(uri); + return (util_err(session, ret, NULL)); + } + if ((ret = + session->open_cursor(session, uri, NULL, config, &cursor)) != 0) + (void)util_err(session, ret, "%s: session.open_cursor", uri); + free(uri); + if (ret != 0) + return (ret); /* * A simple search only makes sense if the key format is a string or a |