diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-03-25 08:19:12 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-03-25 08:19:12 +1100 |
commit | 43e885a0f9a3ad046eae1726b005ca1280624be3 (patch) | |
tree | f03d31a7b36c476484f26ea5259777aeb7b13201 /src | |
parent | 5cdd3e320cb19cd54111c2572a3d6e33d3009ad4 (diff) | |
parent | 9cf8eb2f15c6df7da90c19c86ccf7516ed126183 (diff) | |
download | mongodb-3.2.5.tar.gz |
Merge branch 'mongodb-3.4' into mongodb-3.2 (tag: mongodb-3.2.5)
Diffstat (limited to 'src')
142 files changed, 7689 insertions, 3062 deletions
diff --git a/src/async/async_op.c b/src/async/async_op.c index 130c704757b..970c33c3360 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -349,14 +349,8 @@ __wt_async_op_init(WT_SESSION_IMPL *session) WT_ERR(__async_op_init(conn, op, i)); } return (0); -err: - if (async->async_ops != NULL) { - __wt_free(session, async->async_ops); - async->async_ops = NULL; - } - if (async->async_queue != NULL) { - __wt_free(session, async->async_queue); - async->async_queue = NULL; - } + +err: __wt_free(session, async->async_ops); + __wt_free(session, async->async_queue); return (ret); } diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c index 03059c8f23a..812bf99acfb 100644 --- a/src/block/block_ckpt.c +++ b/src/block/block_ckpt.c @@ -812,8 +812,7 @@ __ckpt_string(WT_SESSION_IMPL *session, WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci)); WT_RET(__wt_buf_fmt(session, buf, - "version=%d", - ci->version)); + "version=%" PRIu8, ci->version)); if (ci->root_offset == WT_BLOCK_INVALID_OFFSET) WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]")); else diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c index dceaae8bb99..0bb75d129e1 100644 --- a/src/block/block_mgr.c +++ b/src/block/block_mgr.c @@ -69,6 +69,21 @@ __bm_checkpoint(WT_BM *bm, } /* + * __bm_checkpoint_readonly -- + * Write a buffer into a block, creating a checkpoint; readonly version. + */ +static int +__bm_checkpoint_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_cksum) +{ + WT_UNUSED(buf); + WT_UNUSED(ckptbase); + WT_UNUSED(data_cksum); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_checkpoint_load -- * Load a checkpoint. */ @@ -113,6 +128,16 @@ __bm_checkpoint_resolve(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_checkpoint_resolve_readonly -- + * Resolve the checkpoint; readonly version. 
+ */ +static int +__bm_checkpoint_resolve_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_checkpoint_unload -- * Unload a checkpoint point. */ @@ -161,6 +186,16 @@ __bm_compact_end(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_compact_end_readonly -- + * End a block manager compaction; readonly version. + */ +static int +__bm_compact_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_page_skip -- * Return if a page is useful for compaction. */ @@ -173,6 +208,21 @@ __bm_compact_page_skip(WT_BM *bm, WT_SESSION_IMPL *session, } /* + * __bm_compact_page_skip_readonly -- + * Return if a page is useful for compaction; readonly version. + */ +static int +__bm_compact_page_skip_readonly(WT_BM *bm, WT_SESSION_IMPL *session, + const uint8_t *addr, size_t addr_size, bool *skipp) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + WT_UNUSED(skipp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_skip -- * Return if a file can be compacted. */ @@ -183,6 +233,18 @@ __bm_compact_skip(WT_BM *bm, WT_SESSION_IMPL *session, bool *skipp) } /* + * __bm_compact_skip_readonly -- + * Return if a file can be compacted; readonly version. + */ +static int +__bm_compact_skip_readonly(WT_BM *bm, WT_SESSION_IMPL *session, bool *skipp) +{ + WT_UNUSED(skipp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_compact_start -- * Start a block manager compaction. */ @@ -193,6 +255,16 @@ __bm_compact_start(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_compact_start_readonly -- + * Start a block manager compaction; readonly version. + */ +static int +__bm_compact_start_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_free -- * Free a block of space to the underlying file. 
*/ @@ -204,6 +276,20 @@ __bm_free(WT_BM *bm, } /* + * __bm_free_readonly -- + * Free a block of space to the underlying file; readonly version. + */ +static int +__bm_free_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_is_mapped -- * Return if the file is mapped into memory. */ @@ -226,6 +312,31 @@ __bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_salvage_end_readonly -- + * End a block manager salvage; readonly version. + */ +static int +__bm_salvage_end_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* + * __bm_salvage_next_readonly -- + * Return the next block from the file; readonly version. + */ +static int +__bm_salvage_next_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, uint8_t *addr, size_t *addr_sizep, bool *eofp) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_sizep); + WT_UNUSED(eofp); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_salvage_next -- * Return the next block from the file. */ @@ -248,6 +359,16 @@ __bm_salvage_start(WT_BM *bm, WT_SESSION_IMPL *session) } /* + * __bm_salvage_start_readonly -- + * Start a block manager salvage; readonly version. + */ +static int +__bm_salvage_start_readonly(WT_BM *bm, WT_SESSION_IMPL *session) +{ + return (__bm_readonly(bm, session)); +} + +/* * __bm_salvage_valid -- * Inform salvage a block is valid. */ @@ -260,6 +381,21 @@ __bm_salvage_valid(WT_BM *bm, } /* + * __bm_salvage_valid_readonly -- + * Inform salvage a block is valid; readonly version. + */ +static int +__bm_salvage_valid_readonly(WT_BM *bm, + WT_SESSION_IMPL *session, uint8_t *addr, size_t addr_size, bool valid) +{ + WT_UNUSED(addr); + WT_UNUSED(addr_size); + WT_UNUSED(valid); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_stat -- * Block-manager statistics. 
*/ @@ -283,6 +419,18 @@ __bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, bool async) } /* + * __bm_sync_readonly -- + * Flush a file to disk; readonly version. + */ +static int +__bm_sync_readonly(WT_BM *bm, WT_SESSION_IMPL *session, bool async) +{ + WT_UNUSED(async); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_verify_addr -- * Verify an address. */ @@ -327,6 +475,23 @@ __bm_write(WT_BM *bm, WT_SESSION_IMPL *session, } /* + * __bm_write_readonly -- + * Write a buffer into a block, returning the block's address cookie; + * readonly version. + */ +static int +__bm_write_readonly(WT_BM *bm, WT_SESSION_IMPL *session, + WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_cksum) +{ + WT_UNUSED(buf); + WT_UNUSED(addr); + WT_UNUSED(addr_sizep); + WT_UNUSED(data_cksum); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_write_size -- * Return the buffer size required to write a block. */ @@ -337,84 +502,68 @@ __bm_write_size(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep) } /* + * __bm_write_size_readonly -- + * Return the buffer size required to write a block; readonly version. + */ +static int +__bm_write_size_readonly(WT_BM *bm, WT_SESSION_IMPL *session, size_t *sizep) +{ + WT_UNUSED(sizep); + + return (__bm_readonly(bm, session)); +} + +/* * __bm_method_set -- * Set up the legal methods. 
*/ static void __bm_method_set(WT_BM *bm, bool readonly) { + bm->addr_invalid = __bm_addr_invalid; + bm->addr_string = __bm_addr_string; + bm->block_header = __bm_block_header; + bm->checkpoint = __bm_checkpoint; + bm->checkpoint_load = __bm_checkpoint_load; + bm->checkpoint_resolve = __bm_checkpoint_resolve; + bm->checkpoint_unload = __bm_checkpoint_unload; + bm->close = __bm_close; + bm->compact_end = __bm_compact_end; + bm->compact_page_skip = __bm_compact_page_skip; + bm->compact_skip = __bm_compact_skip; + bm->compact_start = __bm_compact_start; + bm->free = __bm_free; + bm->is_mapped = __bm_is_mapped; + bm->preload = __wt_bm_preload; + bm->read = __wt_bm_read; + bm->salvage_end = __bm_salvage_end; + bm->salvage_next = __bm_salvage_next; + bm->salvage_start = __bm_salvage_start; + bm->salvage_valid = __bm_salvage_valid; + bm->size = __wt_block_manager_size; + bm->stat = __bm_stat; + bm->sync = __bm_sync; + bm->verify_addr = __bm_verify_addr; + bm->verify_end = __bm_verify_end; + bm->verify_start = __bm_verify_start; + bm->write = __bm_write; + bm->write_size = __bm_write_size; + if (readonly) { - bm->addr_invalid = __bm_addr_invalid; - bm->addr_string = __bm_addr_string; - bm->block_header = __bm_block_header; - bm->checkpoint = (int (*)(WT_BM *, WT_SESSION_IMPL *, - WT_ITEM *, WT_CKPT *, bool))__bm_readonly; - bm->checkpoint_load = __bm_checkpoint_load; - bm->checkpoint_resolve = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->checkpoint_unload = __bm_checkpoint_unload; - bm->close = __bm_close; - bm->compact_end = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->compact_page_skip = (int (*)(WT_BM *, WT_SESSION_IMPL *, - const uint8_t *, size_t, bool *))__bm_readonly; - bm->compact_skip = (int (*) - (WT_BM *, WT_SESSION_IMPL *, bool *))__bm_readonly; - bm->compact_start = - (int (*)(WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->free = (int (*)(WT_BM *, - WT_SESSION_IMPL *, const uint8_t *, size_t))__bm_readonly; - bm->is_mapped = 
__bm_is_mapped; - bm->preload = __wt_bm_preload; - bm->read = __wt_bm_read; - bm->salvage_end = (int (*) - (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->salvage_next = (int (*)(WT_BM *, WT_SESSION_IMPL *, - uint8_t *, size_t *, bool *))__bm_readonly; - bm->salvage_start = (int (*) - (WT_BM *, WT_SESSION_IMPL *))__bm_readonly; - bm->salvage_valid = (int (*)(WT_BM *, - WT_SESSION_IMPL *, uint8_t *, size_t, bool))__bm_readonly; - bm->size = __wt_block_manager_size; - bm->stat = __bm_stat; - bm->sync = - (int (*)(WT_BM *, WT_SESSION_IMPL *, bool))__bm_readonly; - bm->verify_addr = __bm_verify_addr; - bm->verify_end = __bm_verify_end; - bm->verify_start = __bm_verify_start; - bm->write = (int (*)(WT_BM *, WT_SESSION_IMPL *, - WT_ITEM *, uint8_t *, size_t *, bool))__bm_readonly; - bm->write_size = (int (*) - (WT_BM *, WT_SESSION_IMPL *, size_t *))__bm_readonly; - } else { - bm->addr_invalid = __bm_addr_invalid; - bm->addr_string = __bm_addr_string; - bm->block_header = __bm_block_header; - bm->checkpoint = __bm_checkpoint; - bm->checkpoint_load = __bm_checkpoint_load; - bm->checkpoint_resolve = __bm_checkpoint_resolve; - bm->checkpoint_unload = __bm_checkpoint_unload; - bm->close = __bm_close; - bm->compact_end = __bm_compact_end; - bm->compact_page_skip = __bm_compact_page_skip; - bm->compact_skip = __bm_compact_skip; - bm->compact_start = __bm_compact_start; - bm->free = __bm_free; - bm->is_mapped = __bm_is_mapped; - bm->preload = __wt_bm_preload; - bm->read = __wt_bm_read; - bm->salvage_end = __bm_salvage_end; - bm->salvage_next = __bm_salvage_next; - bm->salvage_start = __bm_salvage_start; - bm->salvage_valid = __bm_salvage_valid; - bm->size = __wt_block_manager_size; - bm->stat = __bm_stat; - bm->sync = __bm_sync; - bm->verify_addr = __bm_verify_addr; - bm->verify_end = __bm_verify_end; - bm->verify_start = __bm_verify_start; - bm->write = __bm_write; - bm->write_size = __bm_write_size; + bm->checkpoint = __bm_checkpoint_readonly; + bm->checkpoint_resolve = 
__bm_checkpoint_resolve_readonly; + bm->compact_end = __bm_compact_end_readonly; + bm->compact_page_skip = __bm_compact_page_skip_readonly; + bm->compact_skip = __bm_compact_skip_readonly; + bm->compact_start = __bm_compact_start_readonly; + bm->free = __bm_free_readonly; + bm->salvage_end = __bm_salvage_end_readonly; + bm->salvage_next = __bm_salvage_next_readonly; + bm->salvage_start = __bm_salvage_start_readonly; + bm->salvage_valid = __bm_salvage_valid_readonly; + bm->sync = __bm_sync_readonly; + bm->write = __bm_write_readonly; + bm->write_size = __bm_write_size_readonly; } } diff --git a/src/block/block_open.c b/src/block/block_open.c index d9b2f908737..adb745c99e7 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -369,7 +369,7 @@ __desc_read(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build only " "supports major/minor versions up to %d/%d, and the file " - "is version %d/%d", + "is version %" PRIu16 "/%" PRIu16, WT_BLOCK_MAJOR_VERSION, WT_BLOCK_MINOR_VERSION, desc->majorv, desc->minorv); diff --git a/src/block/block_write.c b/src/block/block_write.c index 4c6ac198fe4..e05a430832e 100644 --- a/src/block/block_write.c +++ b/src/block/block_write.c @@ -206,10 +206,16 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, uint32_t cksum; bool local_locked; - blk = WT_BLOCK_HEADER_REF(buf->mem); fh = block->fh; /* + * Clear the block header to ensure all of it is initialized, even the + * unused fields. + */ + blk = WT_BLOCK_HEADER_REF(buf->mem); + memset(blk, 0, sizeof(*blk)); + + /* * Swap the page-header as needed; this doesn't belong here, but it's * the best place to catch all callers. 
*/ diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 12df19a7e04..9cc56c56452 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -96,14 +96,13 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; - bool block_manager_begin, skip; + bool skip; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ref = NULL; - block_manager_begin = false; WT_STAT_FAST_DATA_INCR(session, session_compact); @@ -123,24 +122,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * - * There are three ways we call reconciliation: checkpoints, threads - * writing leaf pages (usually in preparation for a checkpoint or if - * closing a file), and eviction. - * - * We're holding the schema lock which serializes with checkpoints. - */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - - /* - * Get the tree handle's flush lock which blocks threads writing leaf - * pages. + * There are two ways we call reconciliation: checkpoints and eviction. + * Get the tree's flush lock which blocks threads writing pages for + * checkpoints. */ __wt_spin_lock(session, &btree->flush_lock); - /* Start compaction. */ - WT_ERR(bm->compact_start(bm, session)); - block_manager_begin = true; - /* Walk the tree reviewing pages to see if they should be re-written. */ for (;;) { /* @@ -170,9 +157,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); - if (block_manager_begin) - WT_TRET(bm->compact_end(bm, session)); - /* Unblock threads writing leaf pages. 
*/ __wt_spin_unlock(session, &btree->flush_lock); diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index a083ec4016e..7475c0f1312 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -51,7 +51,8 @@ restart: if (cbt->btree->type == BTREE_ROW) { key.data = WT_INSERT_KEY(current); key.size = WT_INSERT_KEY_SIZE(current); - WT_RET(__wt_search_insert(session, cbt, &key)); + WT_RET(__wt_search_insert( + session, cbt, cbt->ins_head, &key)); } else cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index c11b7d35de6..1f3ac443495 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -173,13 +173,18 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) */ break; case BTREE_COL_VAR: + /* The search function doesn't check for empty pages. */ + if (page->pg_var_entries == 0) + return (false); + WT_ASSERT(session, cbt->slot < page->pg_var_entries); + /* - * If search returned an insert object, there may or may not be - * a matching on-page object, we have to check. Variable-length - * column-store pages don't map one-to-one to keys, but have - * "slots", check if search returned a valid slot. + * Column-store updates aren't stored on the page, instead they + * are stored as "insert" objects. If search returned an insert + * object we can't return, the returned on-page object must be + * checked for a match. */ - if (cbt->slot >= page->pg_var_entries) + if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) return (false); /* @@ -194,6 +199,11 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) return (false); break; case BTREE_ROW: + /* The search function doesn't check for empty pages. */ + if (page->pg_row_entries == 0) + return (false); + WT_ASSERT(session, cbt->slot < page->pg_row_entries); + /* * See above: for row-store, no insert object can have the same * key as an on-page object, we're done. 
@@ -201,15 +211,6 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) if (cbt->ins != NULL) return (false); - /* - * Check if searched returned a valid slot (the failure mode is - * an empty page, the search function doesn't check, and so the - * more exact test is "page->pg_row_entries == 0", but this test - * mirrors the column-store test). - */ - if (cbt->slot >= page->pg_row_entries) - return (false); - /* Updates are stored on the page, check for a delete. */ if (page->pg_row_upd != NULL && (upd = __wt_txn_read( session, page->pg_row_upd[cbt->slot])) != NULL) { @@ -1162,22 +1163,14 @@ int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) { WT_BTREE *btree; - WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; - cbt = (start != NULL) ? start : stop; - session = (WT_SESSION_IMPL *)cbt->iface.session; - btree = cbt->btree; + session = (WT_SESSION_IMPL *)start->iface.session; + btree = start->btree; WT_STAT_FAST_DATA_INCR(session, cursor_truncate); /* - * We always delete in a forward direction because it's faster, assert - * our caller provided us with a start cursor. - */ - WT_ASSERT(session, start != NULL); - - /* * For recovery, log the start and stop keys for a truncate operation, * not the individual records removed. On the other hand, for rollback * we need to keep track of all the in-memory operations. 
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index 795111d53f9..1f739c9572e 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -337,8 +337,7 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) copy = WT_ROW_KEY_COPY(rip); (void)__wt_row_leaf_key_info( page, copy, &ikey, NULL, NULL, NULL); - if (ikey != NULL) - __wt_free(session, ikey); + __wt_free(session, ikey); } /* diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 2db3ca7d984..1d33a7e7c9a 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -36,7 +36,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]) btree = S2BT(session); /* Checkpoint files are readonly. */ - readonly = dhandle->checkpoint != NULL; + readonly = (dhandle->checkpoint != NULL || + F_ISSET(S2C(session), WT_CONN_READONLY)); /* Get the checkpoint information for this name/checkpoint pair. */ WT_CLEAR(ckpt); @@ -349,7 +350,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* Initialize locks. */ WT_RET(__wt_rwlock_alloc( session, &btree->ovfl_lock, "btree overflow lock")); - WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock")); + WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ btree->modified = 0; /* Clean */ diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index ac9faef4ff2..5cf6a9bf2bc 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -281,10 +281,8 @@ err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. 
*/ - if (upd != NULL) - __wt_free(session, upd); - if (first_upd != NULL) - __wt_free_update_list(session, first_upd); + __wt_free(session, upd); + __wt_free_update_list(session, first_upd); __wt_scr_free(session, &current_key); __wt_scr_free(session, &las_addr); @@ -460,12 +458,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; - bool busy, cache_work, oldgen, stalled; + bool busy, cache_work, evict_soon, stalled; int force_attempts; btree = S2BT(session); - for (oldgen = stalled = false, + for (evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: @@ -486,7 +484,16 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); - oldgen = LF_ISSET(WT_READ_WONT_NEED) || + + /* + * If configured to not trash the cache, leave the page + * generation unset, we'll set it before returning to + * the oldest read generation, so the page is forcibly + * evicted as soon as possible. We don't do that set + * here because we don't want to evict the page before + * we "acquire" it. + */ + evict_soon = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: @@ -575,20 +582,24 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags } /* - * If we read the page and we are configured to not - * trash the cache, set the oldest read generation so - * the page is forcibly evicted as soon as possible. + * If we read the page and are configured to not trash + * the cache, and no other thread has already used the + * page, set the oldest read generation so the page is + * forcibly evicted as soon as possible. * - * Otherwise, update the page's read generation.
+ * Otherwise, if we read the page, or, if configured to + * update the page's read generation and the page isn't + * already flagged for forced eviction, update the page + * read generation. */ page = ref->page; - if (oldgen && page->read_gen == WT_READGEN_NOTSET) - __wt_page_evict_soon(page); - else if (!LF_ISSET(WT_READ_NO_GEN) && - page->read_gen != WT_READGEN_OLDEST && - page->read_gen < __wt_cache_read_gen(session)) - page->read_gen = - __wt_cache_read_gen_bump(session); + if (page->read_gen == WT_READGEN_NOTSET) { + if (evict_soon) + __wt_page_evict_soon(page); + else + __wt_cache_read_gen_new(session, page); + } else if (!LF_ISSET(WT_READ_NO_GEN)) + __wt_cache_read_gen_bump(session, page); skip_evict: /* * Check if we need an autocommit transaction. diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c index 86360e83ddf..d94eb2ddd80 100644 --- a/src/btree/bt_rebalance.c +++ b/src/btree/bt_rebalance.c @@ -412,6 +412,7 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) WT_UNUSED(cfg); btree = S2BT(session); + evict_reset = false; /* * If the tree has never been written to disk, we're done, rebalance @@ -438,7 +439,8 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) * cache is the root page, and that cannot be evicted; however, this way * eviction ignores the tree entirely.) */ - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); + WT_ERR(__wt_evict_file_exclusive_on(session)); + evict_reset = true; /* Recursively walk the tree. */ switch (rs->type) { @@ -470,7 +472,10 @@ __wt_bt_rebalance(WT_SESSION_IMPL *session, const char *cfg[]) btree->root.page = rs->root; rs->root = NULL; -err: /* Discard any leftover root page we created. */ +err: if (evict_reset) + __wt_evict_file_exclusive_off(session); + + /* Discard any leftover root page we created. 
*/ if (rs->root != NULL) { __wt_page_modify_clear(session, rs->root); __wt_page_out(session, &rs->root); diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index 8d78bda79fb..0e064d306b6 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -1206,8 +1206,7 @@ __slvg_col_build_internal( __wt_root_ref_init(&ss->root_ref, page, true); if (0) { -err: if (addr != NULL) - __wt_free(session, addr); +err: __wt_free(session, addr); __wt_page_out(session, &page); } return (ret); @@ -1868,8 +1867,7 @@ __slvg_row_build_internal( __wt_root_ref_init(&ss->root_ref, page, false); if (0) { -err: if (addr != NULL) - __wt_free(session, addr); +err: __wt_free(session, addr); __wt_page_out(session, &page); } return (ret); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index bd38451d5d1..4f16a290958 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -151,8 +151,7 @@ __wt_split_stash_discard_all( for (i = 0, stash = session->split_stash; i < session->split_stash_cnt; ++i, ++stash) - if (stash->p != NULL) - __wt_free(session_safe, stash->p); + __wt_free(session_safe, stash->p); __wt_free(session_safe, session->split_stash); session->split_stash_cnt = session->split_stash_alloc = 0; @@ -1383,11 +1382,27 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; bool parent_hazard; + btree = S2BT(session); + + /* + * Disallow internal splits during the final pass of a checkpoint. Most + * splits are already disallowed during checkpoints, but an important + * exception is insert splits. 
The danger is an insert split creates a + * new chunk of the namespace, and then the internal split will move it + * to a different part of the tree where it will be written; in other + * words, in one part of the tree we'll skip the newly created insert + * split chunk, but we'll write it upon finding it in a different part + * of the tree. + */ + if (btree->checkpointing != WT_CKPT_OFF) + return (__split_internal_unlock(session, page, page_hazard)); + /* * Page splits trickle up the tree, that is, as leaf pages grow large * enough and are evicted, they'll split into their parent. And, as @@ -1771,8 +1786,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); /* - * The new page is dirty by definition, column-store splits update the - * page-modify structure, so create it now. + * The new page is dirty by definition, plus column-store splits update + * the page-modify structure, so create it now. */ WT_ERR(__wt_page_modify_init(session, right)); __wt_page_modify_set(session, right); @@ -1813,15 +1828,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * We modified the page above, which will have set the first dirty - * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Set the first dirty - * transaction to an impossibly old value so this page is never skipped - * in a checkpoint. - */ - right->modify->first_dirty_txn = WT_TXN_FIRST; - - /* * Calculate how much memory we're moving: figure out how deep the skip * list stack is for the element we are moving, and the memory used by * the item's list of updates. @@ -1919,6 +1925,24 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* + * We perform insert splits concurrently with checkpoints, where the + * requirement is a checkpoint must include either the original page + * or both new pages. 
The page we're splitting is dirty, but that's + * insufficient: set the first dirty transaction to an impossibly old + * value so this page is not skipped by a checkpoint. + */ + page->modify->first_dirty_txn = WT_TXN_FIRST; + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we installed may be older than that. Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. + */ + right->modify->first_dirty_txn = WT_TXN_FIRST; + + /* * Update the page accounting. * * XXX @@ -1928,10 +1952,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. On successful return, the original page is no - * longer locked, so we cannot safely look at it. + * The act of splitting into the parent releases the pages for eviction; + * ensure the page contents are consistent. + */ + WT_WRITE_BARRIER(); + + /* + * Split into the parent. */ - page = NULL; if ((ret = __split_parent( session, ref, split_ref, 2, parent_incr, false, true)) == 0) return (0); @@ -1941,7 +1969,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Reset the split column-store page record. 
*/ - page->modify->mod_split_recno = WT_RECNO_OOB; + if (type != WT_PAGE_ROW_LEAF) + page->modify->mod_split_recno = WT_RECNO_OOB; /* * Clear the allocated page's reference to the moved insert list element diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 5cbd8d1e996..57056eb5c99 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -17,18 +17,18 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) { struct timespec end, start; WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; - uint64_t saved_snap_min; + uint64_t oldest_id, saved_snap_min; uint32_t flags; - bool evict_reset; + conn = S2C(session); btree = S2BT(session); - walk = NULL; txn = &session->txn; saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; @@ -56,6 +56,15 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) return (0); } + /* + * Save the oldest transaction ID we need to keep around. + * Otherwise, in a busy system, we could be updating pages so + * fast that write leaves never catches up. We deliberately + * have no transaction running at this point that would keep + * the oldest ID from moving forwards as we walk the tree. + */ + oldest_id = __wt_txn_oldest_id(session); + flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL; for (walk = NULL;;) { WT_ERR(__wt_tree_walk(session, &walk, flags)); @@ -64,13 +73,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) /* * Write dirty pages if nobody beat us to it. Don't - * try to write the hottest pages: checkpoint will have - * to visit them anyway. + * try to write hot pages (defined as pages that have + * been updated since the write phase leaves started): + * checkpoint will have to visit them anyway. 
*/ page = walk->page; if (__wt_page_is_modified(page) && - __wt_txn_visible_all( - session, page->modify->update_txn)) { + WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; @@ -105,19 +114,18 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_spin_lock(session, &btree->flush_lock); /* - * When internal pages are being reconciled by checkpoint their - * child pages cannot disappear from underneath them or be split - * into them, nor can underlying blocks be freed until the block - * lists for the checkpoint are stable. Set the checkpointing - * flag to block eviction of dirty pages until the checkpoint's - * internal page pass is complete, then wait for any existing - * eviction to complete. + * In the final checkpoint pass, child pages cannot be evicted + * from underneath internal pages nor can underlying blocks be + * freed until the checkpoint's block lists are stable. Also, + * we cannot split child pages into parents unless we know the + * final pass will write a consistent view of that namespace. + * Set the checkpointing flag to block such actions and wait for + * any problematic eviction or page splits to complete. */ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - if (evict_reset) - __wt_evict_file_exclusive_off(session); + WT_ERR(__wt_evict_file_exclusive_on(session)); + __wt_evict_file_exclusive_off(session); WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING); @@ -215,7 +223,7 @@ err: /* On error, clear any left-over tree walk. */ * so that eviction knows that the checkpoint has completed. */ WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); + conn->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); @@ -249,7 +257,8 @@ err: /* On error, clear any left-over tree walk. 
*/ * before checkpointing the file). Start a flush to stable storage, * but don't wait for it. */ - if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES) + if (ret == 0 && + syncop == WT_SYNC_WRITE_LEAVES && F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_RET(btree->bm->sync(btree->bm, session, true)); return (ret); @@ -260,24 +269,18 @@ err: /* On error, clear any left-over tree walk. */ * Cache operations. */ int -__wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op) +__wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op) { - WT_DECL_RET; - WT_BTREE *btree; - - btree = S2BT(session); - switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_CLOSE: /* - * Set the checkpoint reference for reconciliation; it's ugly, - * but drilling a function parameter path from our callers to - * the reconciliation of the tree's root page is going to be - * worse. + * Make sure the checkpoint reference is set for + * reconciliation; it's ugly, but drilling a function parameter + * path from our callers to the reconciliation of the tree's + * root page is going to be worse. 
*/ - WT_ASSERT(session, btree->ckpt == NULL); - btree->ckpt = ckptbase; + WT_ASSERT(session, S2BT(session)->ckpt != NULL); break; case WT_SYNC_DISCARD: case WT_SYNC_WRITE_LEAVES: @@ -287,23 +290,10 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op) switch (op) { case WT_SYNC_CHECKPOINT: case WT_SYNC_WRITE_LEAVES: - WT_ERR(__sync_file(session, op)); - break; + return (__sync_file(session, op)); case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: - WT_ERR(__wt_evict_file(session, op)); - break; + return (__wt_evict_file(session, op)); + WT_ILLEGAL_VALUE(session); } - -err: switch (op) { - case WT_SYNC_CHECKPOINT: - case WT_SYNC_CLOSE: - btree->ckpt = NULL; - break; - case WT_SYNC_DISCARD: - case WT_SYNC_WRITE_LEAVES: - break; - } - - return (ret); } diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index ae2c20be1b6..952298f2456 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -226,7 +226,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); - WT_TRET(__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 55b11d7b2d1..bb8a750d848 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -583,14 +583,14 @@ restart: /* break; } WT_ERR(ret); + couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. 
*/ if (WT_PAGE_IS_INTERNAL(ref->page)) { -descend: couple = ref; - empty_internal = true; +descend: empty_internal = true; /* * There's a split race when a cursor is setting @@ -649,7 +649,6 @@ descend: couple = ref; */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) { - couple = ref; if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 645d98d9c9b..fd60b12538a 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -25,6 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; @@ -60,6 +61,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); + mod = page->modify; /* * Delete, insert or update a column-store entry. @@ -105,17 +107,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_append, ins_headp, 1); - ins_headp = &page->modify->mod_append[0]; + page, mod->mod_append, ins_headp, 1); + ins_headp = &mod->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, 1); - ins_headp = &page->modify->mod_update[0]; + page, mod->mod_update, ins_headp, 1); + ins_headp = &mod->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, + page, mod->mod_update, ins_headp, page->pg_var_entries); - ins_headp = &page->modify->mod_update[cbt->slot]; + ins_headp = &mod->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. 
*/ @@ -135,6 +137,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; + /* + * Check for insert split and checkpoint races in column-store: + * it's easy (as opposed to in row-store) and a difficult bug to + * otherwise diagnose. + */ + WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB || + (recno != WT_RECNO_OOB && mod->mod_split_recno > recno)); + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c index 3aa31044b82..4730267a545 100644 --- a/src/btree/col_srch.c +++ b/src/btree/col_srch.c @@ -77,6 +77,7 @@ __wt_col_search(WT_SESSION_IMPL *session, int depth; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); @@ -116,12 +117,19 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } -restart_root: + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + /* Search the internal pages of the tree. */ current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -138,10 +146,8 @@ restart_page: page = current->page; * on the page), check for an internal page split race. */ if (__wt_split_descent_race( - session, current, parent_pindex)) { - WT_RET(__wt_page_release(session, current, 0)); - goto restart_root; - } + session, current, parent_pindex)) + goto restart; goto descend; } @@ -178,8 +184,14 @@ descend: /* /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. 
+ * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -188,7 +200,7 @@ descend: /* continue; } if (ret == WT_RESTART) - goto restart_page; + goto restart; return (ret); } @@ -199,7 +211,6 @@ descend: /* leaf_only: page = current->page; cbt->ref = current; - cbt->recno = recno; /* * Don't bother searching if the caller is appending a new record where @@ -213,13 +224,6 @@ leaf_only: } /* - * Set the on-page slot to an impossible value larger than any possible - * slot (it's used to interpret the search function's return after the - * search returns an insert list for a page that has no entries). - */ - cbt->slot = UINT32_MAX; - - /* * Search the leaf page. * * Search after a page is pinned does a search of the pinned page before @@ -232,28 +236,38 @@ leaf_only: * that's impossibly large for the page. We do have additional setup to * do in that case, the record may be appended to the page. */ - cbt->compare = 0; if (page->type == WT_PAGE_COL_FIX) { if (recno < page->pg_fix_recno) { + cbt->recno = page->pg_fix_recno; cbt->compare = 1; return (0); } if (recno >= page->pg_fix_recno + page->pg_fix_entries) { cbt->recno = page->pg_fix_recno + page->pg_fix_entries; goto past_end; - } else + } else { + cbt->recno = recno; + cbt->compare = 0; ins_head = WT_COL_UPDATE_SINGLE(page); + } } else { if (recno < page->pg_var_recno) { + cbt->recno = page->pg_var_recno; + cbt->slot = 0; cbt->compare = 1; return (0); } if ((cip = __col_var_search(page, recno, NULL)) == NULL) { cbt->recno = __col_var_last_recno(page); + cbt->slot = page->pg_var_entries == 0 ? 
+ 0 : page->pg_var_entries - 1; goto past_end; } else { + cbt->recno = recno; cbt->slot = WT_COL_SLOT(page, cip); + cbt->compare = 0; ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); + F_SET(cbt, WT_CBT_VAR_ONPAGE_MATCH); } } diff --git a/src/btree/row_key.c b/src/btree/row_key.c index 8b9e858ec18..9fff092d079 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -52,6 +52,7 @@ __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page) WT_RET(__wt_scr_alloc(session, 0, &key)); WT_RET(__wt_scr_alloc(session, (uint32_t)__bitstr_size(page->pg_row_entries), &tmp)); + memset(tmp->mem, 0, tmp->memsize); if ((gap = btree->key_gap) == 0) gap = 1; diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 28c55a4ccd0..6169a0a810a 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -9,18 +9,17 @@ #include "wt_internal.h" /* - * __wt_search_insert_append -- + * __search_insert_append -- * Fast append search of a row-store insert list, creating a skiplist stack * as we go. */ static inline int -__wt_search_insert_append(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep) +__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; int cmp, i; @@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, collator = btree->collator; *donep = 0; - inshead = cbt->ins_head; - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); key.data = WT_INSERT_KEY(ins); key.size = WT_INSERT_KEY_SIZE(ins); @@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, */ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) { cbt->ins_stack[i] = (i == 0) ? &ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? 
+ &ins_head->tail[i]->next[i] : &ins_head->head[i]; cbt->next_stack[i] = NULL; } cbt->compare = -cmp; cbt->ins = ins; + cbt->ins_head = ins_head; *donep = 1; } return (0); @@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, * Search a row-store insert list, creating a skiplist stack as we go. */ int -__wt_search_insert( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key) +__wt_search_insert(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins, **insp, *last_ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; size_t match, skiphigh, skiplow; int cmp, i; btree = S2BT(session); collator = btree->collator; - inshead = cbt->ins_head; cmp = 0; /* -Wuninitialized */ /* @@ -86,7 +83,7 @@ __wt_search_insert( */ match = skiphigh = skiplow = 0; ins = last_ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { if ((ins = *insp) == NULL) { cbt->next_stack[i] = NULL; cbt->ins_stack[i--] = insp--; @@ -128,6 +125,7 @@ __wt_search_insert( */ cbt->compare = -cmp; cbt->ins = (ins != NULL) ? ins : last_ins; + cbt->ins_head = ins_head; return (0); } @@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_COLLATOR *collator; WT_DECL_RET; + WT_INSERT_HEAD *ins_head; WT_ITEM *item; WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; @@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session, goto leaf_only; } + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + skiphigh = skiplow = 0; + } + /* Search the internal pages of the tree. 
*/ -restart_root: current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -419,20 +426,20 @@ restart_page: page = current->page; */ if (pindex->entries == base) { append: if (__wt_split_descent_race( - session, current, parent_pindex)) { - if ((ret = __wt_page_release( - session, current, 0)) != 0) - return (ret); - - skiplow = skiphigh = 0; - goto restart_root; - } + session, current, parent_pindex)) + goto restart; } descend: /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -440,10 +447,8 @@ descend: /* current = descent; continue; } - if (ret == WT_RESTART) { - skiphigh = skiplow = 0; - goto restart_page; - } + if (ret == WT_RESTART) + goto restart; return (ret); } @@ -456,6 +461,12 @@ leaf_only: cbt->ref = current; /* + * Clear current now that we have moved the reference into the btree + * cursor, so that cleanup never releases twice. + */ + current = NULL; + + /* * In the case of a right-side tree descent during an insert, do a fast * check for an append to the page, try to catch cursors appending data * into the tree. 
@@ -479,24 +490,18 @@ leaf_only: cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (page->pg_row_entries - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); - - /* - * Don't leave the insert list head set, code external to the - * search uses it. - */ - cbt->ins_head = NULL; } /* @@ -589,16 +594,16 @@ leaf_match: cbt->compare = 0; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } /* If there's no insert list, we're done. */ - if (WT_SKIP_FIRST(cbt->ins_head) == NULL) + if (WT_SKIP_FIRST(ins_head) == NULL) return (0); /* @@ -606,23 +611,16 @@ leaf_match: cbt->compare = 0; * catch cursors repeatedly inserting at a single point. */ if (insert) { - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); } - WT_ERR(__wt_search_insert(session, cbt, srch_key)); + WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key)); return (0); -err: /* - * Release the current page if the search started at the root. If the - * search didn't start at the root we should never have gone looking - * beyond the start page. 
- */ - WT_ASSERT(session, leaf == NULL || leaf == current); - if (leaf == NULL) - WT_TRET(__wt_page_release(session, current, 0)); +err: WT_TRET(__wt_page_release(session, current, 0)); return (ret); } @@ -660,19 +658,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert * list. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - - /* + * * Walk down the list until we find a level with at least 50 entries, * that's where we'll start rolling random numbers. The value 50 is * used to ignore levels with only a few entries, that is, levels which * are potentially badly skewed. */ - for (ins_head = cbt->ins_head, - level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { start = &ins_head->head[level]; for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) @@ -767,6 +762,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) ins = ins->next[0]; cbt->ins = ins; + cbt->ins_head = ins_head; cbt->compare = 0; return (0); @@ -786,11 +782,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_REF *current, *descent; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); -restart_root: - /* Walk the internal pages of the tree. */ + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + + /* Search the internal pages of the tree. 
*/ current = &btree->root; for (;;) { page = current->page; @@ -802,22 +806,19 @@ restart_root: __wt_random(&session->rnd) % pindex->entries]; /* - * Swap the parent page for the child page; return on error, - * the swap function ensures we're holding nothing on failure. + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. */ if ((ret = __wt_page_swap( session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } - /* - * Restart is returned if we find a page that's been split; the - * held page isn't discarded when restart is returned, discard - * it and restart the search from the top of the tree. - */ - if (ret == WT_RESTART && - (ret = __wt_page_release(session, current, 0)) == 0) - goto restart_root; + if (ret == WT_RESTART) + goto restart; return (ret); } diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 1ef8dd32bb4..8796ec6b2fc 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -58,6 +58,8 @@ __wt_las_create(WT_SESSION_IMPL *session) conn = S2C(session); + if (F_ISSET(conn, WT_CONN_READONLY)) + return (0); /* * Done at startup: we cannot do it on demand because we require the * schema lock to create and drop the table, and it may not always be @@ -203,7 +205,7 @@ __wt_las_cursor( * useful more than once. 
*/ *session_flags = - F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); + F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); conn = S2C(session); diff --git a/src/config/config.c b/src/config/config.c index f480ab83dbd..96ef7a4e62a 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -16,9 +16,9 @@ static int __config_err(WT_CONFIG *conf, const char *msg, int err) { WT_RET_MSG(conf->session, err, - "Error parsing '%.*s' at byte %u: %s", + "Error parsing '%.*s' at offset %" WT_PTRDIFFT_FMT ": %s", (int)(conf->end - conf->orig), conf->orig, - (u_int)(conf->cur - conf->orig), msg); + conf->cur - conf->orig, msg); } /* diff --git a/src/config/config_def.c b/src/config/config_def.c index 879de670695..c752e5eb265 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -99,6 +99,7 @@ static const WT_CONFIG_CHECK static const WT_CONFIG_CHECK confchk_wiredtiger_open_statistics_log_subconfigs[] = { + { "json", "boolean", NULL, NULL, NULL, 0 }, { "on_close", "boolean", NULL, NULL, NULL, 0 }, { "path", "string", NULL, NULL, NULL, 0 }, { "sources", "list", NULL, NULL, NULL, 0 }, @@ -146,7 +147,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "verbose", "list", NULL, "choices=[\"api\",\"block\",\"checkpoint\",\"compact\"," "\"evict\",\"evictserver\",\"fileops\",\"log\",\"lsm\"," @@ -390,6 +391,61 @@ static const WT_CONFIG_CHECK confchk_colgroup_meta[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_file_config[] = { + { "allocation_size", "int", + NULL, "min=512B,max=128MB", + NULL, 0 }, + { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "block_allocation", "string", + NULL, "choices=[\"first\",\"best\"]", + NULL, 0 }, + { "block_compressor", "string", NULL, NULL, NULL, 0 }, + { "cache_resident", 
"boolean", NULL, NULL, NULL, 0 }, + { "checksum", "string", + NULL, "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL, 0 }, + { "collator", "string", NULL, NULL, NULL, 0 }, + { "columns", "list", NULL, NULL, NULL, 0 }, + { "dictionary", "int", NULL, "min=0", NULL, 0 }, + { "encryption", "category", + NULL, NULL, + confchk_WT_SESSION_create_encryption_subconfigs, 2 }, + { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, + { "huffman_key", "string", NULL, NULL, NULL, 0 }, + { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, + { "internal_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, + { "key_gap", "int", NULL, "min=0", NULL, 0 }, + { "leaf_item_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_key_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "leaf_value_max", "int", NULL, "min=0", NULL, 0 }, + { "log", "category", + NULL, NULL, + confchk_WT_SESSION_create_log_subconfigs, 1 }, + { "memory_page_max", "int", + NULL, "min=512B,max=10TB", + NULL, 0 }, + { "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 }, + { "os_cache_max", "int", NULL, "min=0", NULL, 0 }, + { "prefix_compression", "boolean", NULL, NULL, NULL, 0 }, + { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, + { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, + { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "value_format", "format", + __wt_struct_confchk, NULL, + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_file_meta[] = { { "allocation_size", "int", NULL, "min=512B,max=128MB", @@ -465,6 +521,67 @@ static const WT_CONFIG_CHECK 
confchk_index_meta[] = { { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK confchk_lsm_meta[] = { + { "allocation_size", "int", + NULL, "min=512B,max=128MB", + NULL, 0 }, + { "app_metadata", "string", NULL, NULL, NULL, 0 }, + { "block_allocation", "string", + NULL, "choices=[\"first\",\"best\"]", + NULL, 0 }, + { "block_compressor", "string", NULL, NULL, NULL, 0 }, + { "cache_resident", "boolean", NULL, NULL, NULL, 0 }, + { "checksum", "string", + NULL, "choices=[\"on\",\"off\",\"uncompressed\"]", + NULL, 0 }, + { "chunks", "string", NULL, NULL, NULL, 0 }, + { "collator", "string", NULL, NULL, NULL, 0 }, + { "columns", "list", NULL, NULL, NULL, 0 }, + { "dictionary", "int", NULL, "min=0", NULL, 0 }, + { "encryption", "category", + NULL, NULL, + confchk_WT_SESSION_create_encryption_subconfigs, 2 }, + { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, + { "huffman_key", "string", NULL, NULL, NULL, 0 }, + { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, + { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, + { "internal_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "key_format", "format", __wt_struct_confchk, NULL, NULL, 0 }, + { "key_gap", "int", NULL, "min=0", NULL, 0 }, + { "last", "string", NULL, NULL, NULL, 0 }, + { "leaf_item_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_key_max", "int", NULL, "min=0", NULL, 0 }, + { "leaf_page_max", "int", + NULL, "min=512B,max=512MB", + NULL, 0 }, + { "leaf_value_max", "int", NULL, "min=0", NULL, 0 }, + { "log", "category", + NULL, NULL, + confchk_WT_SESSION_create_log_subconfigs, 1 }, + { "lsm", "category", + NULL, NULL, + confchk_WT_SESSION_create_lsm_subconfigs, 11 }, + { "memory_page_max", "int", + NULL, "min=512B,max=10TB", + NULL, 0 }, + { "old_chunks", "string", NULL, NULL, NULL, 0 }, + { "os_cache_dirty_max", "int", NULL, "min=0", NULL, 0 }, + 
{ "os_cache_max", "int", NULL, "min=0", NULL, 0 }, + { "prefix_compression", "boolean", NULL, NULL, NULL, 0 }, + { "prefix_compression_min", "int", NULL, "min=0", NULL, 0 }, + { "split_deepen_min_child", "int", NULL, NULL, NULL, 0 }, + { "split_deepen_per_child", "int", NULL, NULL, NULL, 0 }, + { "split_pct", "int", NULL, "min=25,max=100", NULL, 0 }, + { "value_format", "format", + __wt_struct_confchk, NULL, + NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_table_meta[] = { { "app_metadata", "string", NULL, NULL, NULL, 0 }, { "colgroups", "list", NULL, NULL, NULL, 0 }, @@ -544,6 +661,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -554,7 +672,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -624,6 +742,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -634,7 +753,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + 
confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -701,6 +820,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -711,7 +831,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -776,6 +896,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "lsm_merge", "boolean", NULL, NULL, NULL, 0 }, { "mmap", "boolean", NULL, NULL, NULL, 0 }, { "multiprocess", "boolean", NULL, NULL, NULL, 0 }, + { "readonly", "boolean", NULL, NULL, NULL, 0 }, { "session_max", "int", NULL, "min=1", NULL, 0 }, { "session_scratch_max", "int", NULL, NULL, NULL, 0 }, { "shared_cache", "category", @@ -786,7 +907,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { NULL, 0 }, { "statistics_log", "category", NULL, NULL, - confchk_wiredtiger_open_statistics_log_subconfigs, 5 }, + confchk_wiredtiger_open_statistics_log_subconfigs, 6 }, { "transaction_sync", "category", NULL, NULL, confchk_wiredtiger_open_transaction_sync_subconfigs, 2 }, @@ -853,7 +974,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "file_max=100MB,path=,prealloc=,recover=on,zero_fill=0)," "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=," "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," - 
"statistics=none,statistics_log=(on_close=0," + "statistics=none,statistics_log=(json=0,on_close=0," "path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", confchk_WT_CONNECTION_reconfigure, 18 @@ -980,6 +1101,20 @@ static const WT_CONFIG_ENTRY config_entries[] = { "app_metadata=,collator=,columns=,source=,type=file", confchk_colgroup_meta, 5 }, + { "file.config", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checksum=uncompressed," + "collator=,columns=,dictionary=0,encryption=(keyid=,name=)," + "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB," + "key_format=u,key_gap=10,leaf_item_max=0,leaf_key_max=0," + "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=)," + "memory_page_max=5MB,os_cache_dirty_max=0,os_cache_max=0," + "prefix_compression=0,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_file_config, 33 + }, { "file.meta", "allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=0,checkpoint=,checkpoint_lsn=," @@ -1000,6 +1135,23 @@ static const WT_CONFIG_ENTRY config_entries[] = { "index_key_columns=,key_format=u,source=,type=file,value_format=u", confchk_index_meta, 10 }, + { "lsm.meta", + "allocation_size=4KB,app_metadata=,block_allocation=best," + "block_compressor=,cache_resident=0,checksum=uncompressed,chunks=" + ",collator=,columns=,dictionary=0,encryption=(keyid=,name=)," + "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=,internal_page_max=4KB," + "key_format=u,key_gap=10,last=,leaf_item_max=0,leaf_key_max=0," + "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=)," + "lsm=(auto_throttle=,bloom=,bloom_bit_count=16,bloom_config=," + "bloom_hash_count=8,bloom_oldest=0,chunk_count_limit=0," + 
"chunk_max=5GB,chunk_size=10MB,merge_max=15,merge_min=0)," + "memory_page_max=5MB,old_chunks=,os_cache_dirty_max=0," + "os_cache_max=0,prefix_compression=0,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_lsm_meta, 37 + }, { "table.meta", "app_metadata=,colgroups=,collator=,columns=,key_format=u," "value_format=u", @@ -1017,14 +1169,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment=," "use_environment_priv=0,verbose=,write_through=", - confchk_wiredtiger_open, 37 + confchk_wiredtiger_open, 38 }, { "wiredtiger_open_all", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1038,15 +1190,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "in_memory=0,log=(archive=,compressor=,enabled=0,file_max=100MB," "path=,prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," 
"sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),use_environment=," "use_environment_priv=0,verbose=,version=(major=0,minor=0)," "write_through=", - confchk_wiredtiger_open_all, 38 + confchk_wiredtiger_open_all, 39 }, { "wiredtiger_open_basecfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1059,14 +1211,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," "transaction_sync=(enabled=0,method=fsync),verbose=," "version=(major=0,minor=0),write_through=", - confchk_wiredtiger_open_basecfg, 32 + confchk_wiredtiger_open_basecfg, 33 }, { "wiredtiger_open_usercfg", "async=(enabled=0,ops_max=1024,threads=2),buffer_alignment=-1," @@ -1079,14 +1231,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," - "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0,readonly=0," "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" ",name=,quota=0,reserve=0,size=500MB),statistics=none," - "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "statistics_log=(json=0,on_close=0,path=\"WiredTigerStat.%d.%H\"," "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," 
"transaction_sync=(enabled=0,method=fsync),verbose=," "write_through=", - confchk_wiredtiger_open_usercfg, 31 + confchk_wiredtiger_open_usercfg, 32 }, { NULL, NULL, NULL, 0 } }; diff --git a/src/conn/api_strerror.c b/src/conn/api_strerror.c index edb11957556..87864f7f4b0 100644 --- a/src/conn/api_strerror.c +++ b/src/conn/api_strerror.c @@ -40,6 +40,8 @@ __wt_wiredtiger_error(int error) return ("WT_RUN_RECOVERY: recovery must be run to continue"); case WT_CACHE_FULL: return ("WT_CACHE_FULL: operation would overflow cache"); + case WT_PERM_DENIED: + return ("WT_PERM_DENIED: permission denied (internal)"); } /* diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 27977de63b2..6d115c8fdcd 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -772,6 +772,19 @@ __conn_get_extension_api(WT_CONNECTION *wt_conn) conn->extension_api.transaction_visible = __wt_ext_transaction_visible; conn->extension_api.version = wiredtiger_version; + /* Streaming pack/unpack API */ + conn->extension_api.pack_start = __wt_ext_pack_start; + conn->extension_api.unpack_start = __wt_ext_unpack_start; + conn->extension_api.pack_close = __wt_ext_pack_close; + conn->extension_api.pack_item = __wt_ext_pack_item; + conn->extension_api.pack_int = __wt_ext_pack_int; + conn->extension_api.pack_str = __wt_ext_pack_str; + conn->extension_api.pack_uint = __wt_ext_pack_uint; + conn->extension_api.unpack_item = __wt_ext_unpack_item; + conn->extension_api.unpack_int = __wt_ext_unpack_int; + conn->extension_api.unpack_str = __wt_ext_unpack_str; + conn->extension_api.unpack_uint = __wt_ext_unpack_uint; + return (&conn->extension_api); } @@ -1109,6 +1122,29 @@ __conn_config_append(const char *cfg[], const char *config) } /* + * __conn_config_readonly -- + * Append an entry to a config stack that overrides some settings + * when read-only is configured. + */ +static void +__conn_config_readonly(const char *cfg[]) +{ + const char *readonly; + + /* + * Override certain settings. 
In general we override the options + * whose default conflicts. Other settings at odds will return + * an error and will be checked when those settings are processed. + */ + readonly="checkpoint=(wait=0)," + "config_base=false," + "create=false," + "log=(archive=false,prealloc=false)," + "lsm_manager=(merge=false),"; + __conn_config_append(cfg, readonly); +} + +/* * __conn_config_check_version -- * Check if a configuration version isn't compatible. */ @@ -1382,7 +1418,7 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_FH *fh; size_t len; wt_off_t size; - bool exist, is_create; + bool bytelock, exist, is_create; char buf[256]; conn = S2C(session); @@ -1391,6 +1427,10 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_config_gets(session, cfg, "create", &cval)); is_create = cval.val != 0; + if (F_ISSET(conn, WT_CONN_READONLY)) + is_create = false; + + bytelock = true; __wt_spin_lock(session, &__wt_process.spinlock); /* @@ -1448,47 +1488,89 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) exist = false; if (!is_create) WT_ERR(__wt_exist(session, WT_WIREDTIGER, &exist)); - WT_ERR(__wt_open(session, - WT_SINGLETHREAD, is_create || exist, false, 0, &conn->lock_fh)); + ret = __wt_open(session, + WT_SINGLETHREAD, is_create || exist, false, 0, &conn->lock_fh); /* - * Lock a byte of the file: if we don't get the lock, some other process - * is holding it, we're done. The file may be zero-length, and that's - * OK, the underlying call supports locking past the end-of-file. + * If this is a read-only connection and we cannot grab the lock + * file, check if it is because there is not write permission or + * if the file does not exist. If so, then ignore the error. + * XXX Ignoring the error does allow multiple read-only + * connections to exist at the same time on a read-only directory. 
*/ - if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, true) != 0) - WT_ERR_MSG(session, EBUSY, - "WiredTiger database is already being managed by another " - "process"); + if (F_ISSET(conn, WT_CONN_READONLY)) { + /* + * If we got an expected permission or non-existence error + * then skip the byte lock. + */ + ret = __wt_map_error_rdonly(ret); + if (ret == WT_NOTFOUND || ret == WT_PERM_DENIED) { + bytelock = false; + ret = 0; + } + } + WT_ERR(ret); + if (bytelock) { + /* + * Lock a byte of the file: if we don't get the lock, some other + * process is holding it, we're done. The file may be + * zero-length, and that's OK, the underlying call supports + * locking past the end-of-file. + */ + if (__wt_bytelock(conn->lock_fh, (wt_off_t)0, true) != 0) + WT_ERR_MSG(session, EBUSY, + "WiredTiger database is already being managed by " + "another process"); - /* - * If the size of the lock file is non-zero, we created it (or won a - * locking race with the thread that created it, it doesn't matter). - * - * Write something into the file, zero-length files make me nervous. - * - * The test against the expected length is sheer paranoia (the length - * should be 0 or correct), but it shouldn't hurt. - */ + /* + * If the size of the lock file is non-zero, we created it (or + * won a locking race with the thread that created it, it + * doesn't matter). + * + * Write something into the file, zero-length files make me + * nervous. + * + * The test against the expected length is sheer paranoia (the + * length should be 0 or correct), but it shouldn't hurt. 
+ */ #define WT_SINGLETHREAD_STRING "WiredTiger lock file\n" - WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); - if (size != strlen(WT_SINGLETHREAD_STRING)) - WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0, - strlen(WT_SINGLETHREAD_STRING), WT_SINGLETHREAD_STRING)); + WT_ERR(__wt_filesize(session, conn->lock_fh, &size)); + if (size != strlen(WT_SINGLETHREAD_STRING)) + WT_ERR(__wt_write(session, conn->lock_fh, (wt_off_t)0, + strlen(WT_SINGLETHREAD_STRING), + WT_SINGLETHREAD_STRING)); + + } /* We own the lock file, optionally create the WiredTiger file. */ - WT_ERR(__wt_open(session, WT_WIREDTIGER, is_create, false, 0, &fh)); + ret = __wt_open(session, WT_WIREDTIGER, is_create, false, 0, &fh); /* - * Lock the WiredTiger file (for backward compatibility reasons as - * described above). Immediately release the lock, it's just a test. + * If we're read-only, check for success as well as handled errors. + * Even if we're able to open the WiredTiger file successfully, we + * do not try to lock it. The lock file test above is the only + * one we do for read-only. */ - if (__wt_bytelock(fh, (wt_off_t)0, true) != 0) { - WT_ERR_MSG(session, EBUSY, - "WiredTiger database is already being managed by another " - "process"); + if (F_ISSET(conn, WT_CONN_READONLY)) { + ret = __wt_map_error_rdonly(ret); + if (ret == 0 || ret == WT_NOTFOUND || ret == WT_PERM_DENIED) + ret = 0; + WT_ERR(ret); + } else { + WT_ERR(ret); + + /* + * Lock the WiredTiger file (for backward compatibility reasons + * as described above). Immediately release the lock, it's + * just a test. + */ + if (__wt_bytelock(fh, (wt_off_t)0, true) != 0) { + WT_ERR_MSG(session, EBUSY, + "WiredTiger database is already being managed by " + "another process"); + } + WT_ERR(__wt_bytelock(fh, (wt_off_t)0, false)); } - WT_ERR(__wt_bytelock(fh, (wt_off_t)0, false)); /* * We own the database home, figure out if we're creating it. 
There are @@ -1502,11 +1584,21 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[]) conn->is_new = exist ? 0 : 1; if (conn->is_new) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR_MSG(session, EINVAL, "Creating a new database is" + " incompatible with read-only configuration."); len = (size_t)snprintf(buf, sizeof(buf), "%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING); WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf)); WT_ERR(__wt_fsync(session, fh)); } else { + /* + * Although exclusive and the read-only configuration settings + * are at odds, we do not have to check against read-only here + * because it falls out from earlier code in this function + * preventing creation and confirming the database + * already exists. + */ WT_ERR(__wt_config_gets(session, cfg, "exclusive", &cval)); if (cval.val != 0) WT_ERR_MSG(session, EEXIST, @@ -1602,6 +1694,7 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[]) { "fileops", WT_VERB_FILEOPS }, { "log", WT_VERB_LOG }, { "lsm", WT_VERB_LSM }, + { "lsm_manager", WT_VERB_LSM_MANAGER }, { "metadata", WT_VERB_METADATA }, { "mutex", WT_VERB_MUTEX }, { "overflow", WT_VERB_OVERFLOW }, @@ -1736,6 +1829,7 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[]) "exclusive=," "in_memory=," "log=(recover=)," + "readonly=," "use_environment_priv=," "verbose=,", &base_config)); WT_ERR(__wt_config_init(session, &parser, base_config)); @@ -1808,7 +1902,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const WT_NAME_FLAG *ft; WT_SESSION_IMPL *session; bool config_base_set; - const char *enc_cfg[] = { NULL, NULL }; + const char *enc_cfg[] = { NULL, NULL }, *merge_cfg; char version[64]; /* Leave lots of space for optional additional configuration. 
*/ @@ -1819,6 +1913,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, conn = NULL; session = NULL; + merge_cfg = NULL; WT_RET(__wt_library_init()); @@ -1860,6 +1955,16 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, session, cval.str, cval.len, &conn->error_prefix)); /* + * We need to look for read-only early so that we can use it + * in __conn_single and whether to use the base config file. + * XXX that means we can only make the choice in __conn_single if the + * user passes it in via the config string to wiredtiger_open. + */ + WT_ERR(__wt_config_gets(session, cfg, "readonly", &cval)); + if (cval.val) + F_SET(conn, WT_CONN_READONLY); + + /* * XXX ideally, we would check "in_memory" here, so we could completely * avoid having a database directory. However, it can be convenient to * pass "in_memory" via the WIREDTIGER_CONFIG environment variable, and @@ -1883,6 +1988,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, * 4. the config passed in by the application * 5. user configuration file (optional) * 6. environment variable settings (optional) + * 7. overrides for a read-only connection * * Clear the entries we added to the stack, we're going to build it in * order. @@ -1898,8 +2004,8 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, (int)sizeof(version), ENOMEM); __conn_config_append(cfg, version); - /* Ignore the base_config file if we config_base set to false. */ - if (config_base_set) + /* Ignore the base_config file if config_base_set is false. */ + if (config_base_set || F_ISSET(conn, WT_CONN_READONLY)) WT_ERR( __conn_config_file(session, WT_BASECONFIG, false, cfg, i1)); __conn_config_append(cfg, config); @@ -1909,7 +2015,35 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, /* * Merge the full configuration stack and save it for reconfiguration. 
*/ - WT_ERR(__wt_config_merge(session, cfg, NULL, &conn->cfg)); + WT_ERR(__wt_config_merge(session, cfg, NULL, &merge_cfg)); + /* + * The read-only setting may have been set in a configuration file. + * Get it again so that we can override other configuration settings + * before they are processed by the subsystems. + */ + WT_ERR(__wt_config_gets(session, cfg, "readonly", &cval)); + if (cval.val) + F_SET(conn, WT_CONN_READONLY); + if (F_ISSET(conn, WT_CONN_READONLY)) { + /* + * Create a new stack with the merged configuration as the + * base. The read-only string will use entry 1 and then + * we'll merge it again. + */ + cfg[0] = merge_cfg; + cfg[1] = NULL; + cfg[2] = NULL; + /* + * We override some configuration settings for read-only. + * Other settings that conflict with and are an error with + * read-only are tested in their individual locations later. + */ + __conn_config_readonly(cfg); + WT_ERR(__wt_config_merge(session, cfg, NULL, &conn->cfg)); + } else { + conn->cfg = merge_cfg; + merge_cfg = NULL; + } /* * Configuration ... @@ -2082,6 +2216,7 @@ err: /* Discard the scratch buffers. */ __wt_scr_free(session, &i2); __wt_scr_free(session, &i3); + __wt_free(session, merge_cfg); /* * We may have allocated scratch memory when using the dummy session or * the subsequently created real session, and we don't want to tie down diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c index 1831aad5895..9a2c394e9a6 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -140,6 +140,12 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_cache_config(session, false, cfg)); /* + * The lowest possible page read-generation has a special meaning, it + * marks a page for forcible eviction; don't let it happen by accident. + */ + cache->read_gen = WT_READGEN_START_VALUE; + + /* * The target size must be lower than the trigger size or we will never * get any work done. 
*/ @@ -147,8 +153,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR_MSG(session, EINVAL, "eviction target must be lower than the eviction trigger"); - WT_ERR(__wt_cond_alloc(session, - "cache eviction server", false, &cache->evict_cond)); + WT_ERR(__wt_cond_auto_alloc(session, "cache eviction server", + false, 10000, WT_MILLION, &cache->evict_cond)); WT_ERR(__wt_cond_alloc(session, "eviction waiters", false, &cache->evict_waiter_cond)); WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction")); @@ -246,7 +252,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) " bytes dirty and %" PRIu64 " pages dirty", cache->bytes_dirty, cache->pages_dirty); - WT_TRET(__wt_cond_destroy(session, &cache->evict_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond)); WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond)); __wt_spin_destroy(session, &cache->evict_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 60136a71b99..5019ab59fe3 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -129,7 +129,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - bool evict_reset, marked_dead, no_schema_lock; + bool marked_dead, no_schema_lock; btree = S2BT(session); bm = btree->bm; @@ -139,8 +139,8 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) if (!F_ISSET(dhandle, WT_DHANDLE_OPEN)) return (0); - /* Ensure that we aren't racing with the eviction server */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); + /* Turn off eviction. 
*/ + WT_RET(__wt_evict_file_exclusive_on(session)); /* * If we don't already have the schema lock, make it an error to try @@ -176,23 +176,19 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) WT_BTREE_SALVAGE | WT_BTREE_UPGRADE | WT_BTREE_VERIFY)) { if (force && (bm == NULL || !bm->is_mapped(bm, session))) { F_SET(session->dhandle, WT_DHANDLE_DEAD); + marked_dead = true; - /* - * Reset the tree's eviction priority, and the tree is - * evictable by definition. - */ + /* Reset the tree's eviction priority (if any). */ __wt_evict_priority_clear(session); - F_CLR(S2BT(session), WT_BTREE_NO_EVICTION); - - marked_dead = true; } if (!marked_dead || final) WT_ERR(__wt_checkpoint_close(session, final)); } WT_TRET(__wt_btree_close(session)); + /* - * If we marked a handle as dead it will be closed by sweep, via + * If we marked a handle dead it will be closed by sweep, via * another call to sync and close. */ if (!marked_dead) { @@ -206,12 +202,11 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force) err: __wt_spin_unlock(session, &dhandle->close_lock); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - if (no_schema_lock) F_CLR(session, WT_SESSION_NO_SCHEMA_LOCK); + __wt_evict_file_exclusive_off(session); + return (ret); } @@ -355,42 +350,52 @@ err: F_CLR(btree, WT_BTREE_SPECIAL_FLAGS); /* * __conn_btree_apply_internal -- - * Apply a function to the open btree handles. + * Apply a function to an open data handle. */ static int __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_DECL_RET; + bool skip; + + /* Always apply the name function, if supplied. 
*/ + skip = false; + if (name_func != NULL) + WT_RET(name_func(session, dhandle->name, &skip)); + + /* If there is no file function, don't bother locking the handle */ + if (file_func == NULL || skip) + return (0); /* * We need to pull the handle into the session handle cache and make * sure it's referenced to stop other internal code dropping the handle * (e.g in LSM when cleaning up obsolete chunks). */ - ret = __wt_session_get_btree(session, - dhandle->name, dhandle->checkpoint, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock(session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single(session, dhandle->name, - dhandle->checkpoint, func, cfg); + if ((ret = __wt_session_get_btree(session, + dhandle->name, dhandle->checkpoint, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + + WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock(session, false)); + else + WT_TRET(__wt_session_release_btree(session)); return (ret); } /* * __wt_conn_btree_apply -- - * Apply a function to all open btree handles apart from the metadata. + * Apply a function to all open btree handles with the given URI. 
*/ int -__wt_conn_btree_apply(WT_SESSION_IMPL *session, - bool apply_checkpoints, const char *uri, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +__wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -407,116 +412,27 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - strcmp(uri, dhandle->name) == 0 && - (apply_checkpoints || dhandle->checkpoint == NULL)) - WT_RET(__conn_btree_apply_internal( - session, dhandle, func, cfg)); + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->checkpoint != NULL || + strcmp(uri, dhandle->name) != 0) + continue; + WT_RET(__conn_btree_apply_internal( + session, dhandle, file_func, name_func, cfg)); + } } else { - TAILQ_FOREACH(dhandle, &conn->dhqh, q) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - (apply_checkpoints || - dhandle->checkpoint == NULL) && - WT_PREFIX_MATCH(dhandle->name, "file:") && - !WT_IS_METADATA(session, dhandle)) - WT_RET(__conn_btree_apply_internal( - session, dhandle, func, cfg)); - } - - return (0); -} - -/* - * __wt_conn_btree_apply_single_ckpt -- - * Decode any checkpoint information from the configuration string then - * call btree apply single. 
- */ -int -__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, - const char *uri, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONFIG_ITEM cval; - WT_DECL_RET; - const char *checkpoint; - - checkpoint = NULL; - - /* - * This function exists to handle checkpoint configuration. Callers - * that never open a checkpoint call the underlying function directly. - */ - WT_RET_NOTFOUND_OK( - __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); - if (cval.len != 0) { - /* - * The internal checkpoint name is special, find the last - * unnamed checkpoint of the object. - */ - if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { - WT_RET(__wt_meta_checkpoint_last_name( - session, uri, &checkpoint)); - } else - WT_RET(__wt_strndup( - session, cval.str, cval.len, &checkpoint)); - } - - ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg); - - __wt_free(session, checkpoint); - - return (ret); -} - -/* - * __wt_conn_btree_apply_single -- - * Apply a function to a single btree handle that couldn't be locked - * (attempting to get the handle returned EBUSY). 
- */ -int -__wt_conn_btree_apply_single(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - uint64_t bucket, hash; - - conn = S2C(session); - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - hash = __wt_hash_city64(uri, strlen(uri)); - bucket = hash % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - (hash == dhandle->name_hash && - strcmp(uri, dhandle->name) == 0) && - ((dhandle->checkpoint == NULL && checkpoint == NULL) || - (dhandle->checkpoint != NULL && checkpoint != NULL && - strcmp(dhandle->checkpoint, checkpoint) == 0))) { - /* - * We're holding the handle list lock which locks out - * handle open (which might change the state of the - * underlying object). However, closing a handle - * doesn't require the handle list lock, lock out - * closing the handle and then confirm the handle is - * still open. 
- */ - __wt_spin_lock(session, &dhandle->close_lock); - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - WT_WITH_DHANDLE(session, dhandle, - ret = func(session, cfg)); - } - __wt_spin_unlock(session, &dhandle->close_lock); - WT_RET(ret); + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || + F_ISSET(dhandle, WT_DHANDLE_DEAD) || + dhandle->checkpoint != NULL || + !WT_PREFIX_MATCH(dhandle->name, "file:") || + WT_IS_METADATA(session, dhandle)) + continue; + WT_RET(__conn_btree_apply_internal( + session, dhandle, file_func, name_func, cfg)); } + } return (0); } diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 12b4e87e921..16717597f4d 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -56,6 +56,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); + WT_RET(__wt_spin_init(session, &conn->metadata_lock, "metadata")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -123,7 +124,8 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) * underlying file-close code uses the mutex to guard lists of * open files. */ - WT_TRET(__wt_close(session, &conn->lock_fh)); + if (conn->lock_fh) + WT_TRET(__wt_close(session, &conn->lock_fh)); /* Remove from the list of connections. 
*/ __wt_spin_lock(session, &__wt_process.spinlock); @@ -143,6 +145,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); __wt_spin_destroy(session, &conn->las_lock); + __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 60f46288072..757d69bf240 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -133,10 +133,17 @@ __logmgr_config( FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); - if (cval.val != 0) + if (cval.val != 0) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_RET_MSG(session, EINVAL, + "Read-only configuration incompatible with " + "zero-filling log files"); FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); + } WT_RET(__logmgr_sync_cfg(session, cfg)); + if (conn->log_cond != NULL) + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); } @@ -463,7 +470,7 @@ __log_file_server(void *arg) locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } else { - WT_ERR(__wt_cond_signal( + WT_ERR(__wt_cond_auto_signal( session, conn->log_wrlsn_cond)); /* * We do not want to wait potentially a second @@ -633,7 +640,7 @@ restart: if (slot->slot_start_lsn.l.offset != slot->slot_last_offset) slot->slot_start_lsn.l.offset = - slot->slot_last_offset; + (uint32_t)slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; WT_ERR(__wt_cond_signal( @@ -662,31 +669,54 @@ __log_wrlsn_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_LOG *log; + WT_LSN prev; WT_SESSION_IMPL *session; int yield; + bool did_work; session = arg; conn = S2C(session); + log = conn->log; yield = 0; + WT_INIT_LSN(&prev); + did_work = false; while 
(F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* - * Write out any log record buffers. + * Write out any log record buffers if anything was done + * since last time. Only call the function to walk the + * slots if the system is not idle. On an idle system + * the alloc_lsn will not advance and the written lsn will + * match the alloc_lsn. */ - WT_ERR(__wt_log_wrlsn(session, &yield)); + if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 || + __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0) + WT_ERR(__wt_log_wrlsn(session, &yield)); + else + WT_STAT_FAST_CONN_INCR(session, log_write_lsn_skip); + prev = log->alloc_lsn; + if (yield == 0) + did_work = true; + else + did_work = false; /* * If __wt_log_wrlsn did work we want to yield instead of sleep. */ if (yield++ < WT_THOUSAND) __wt_yield(); else - WT_ERR(__wt_cond_wait( - session, conn->log_wrlsn_cond, 10000)); + /* + * Send in false because if we did any work we would + * not be on this path. + */ + WT_ERR(__wt_cond_auto_wait( + session, conn->log_wrlsn_cond, did_work)); } /* * On close we need to do this one more time because there could * be straggling log writes that need to be written. */ - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); WT_ERR(__wt_log_wrlsn(session, NULL)); if (0) { err: __wt_err(session, ret, "log wrlsn server error"); @@ -701,12 +731,13 @@ err: __wt_err(session, ret, "log wrlsn server error"); static WT_THREAD_RET __log_server(void *arg) { + struct timespec start, now; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; - int freq_per_sec; - bool locked, signalled; + uint64_t timediff; + bool did_work, locked, signalled; session = arg; conn = S2C(session); @@ -714,11 +745,10 @@ __log_server(void *arg) locked = signalled = false; /* - * Set this to the number of times per second we want to force out the - * log slot buffer. + * Set this to the number of milliseconds we want to run archive and + * pre-allocation. 
Start it so that we run on the first time through. */ -#define WT_FORCE_PER_SECOND 20 - freq_per_sec = WT_FORCE_PER_SECOND; + timediff = WT_THOUSAND; /* * The log server thread does a variety of work. It forces out any @@ -731,6 +761,7 @@ __log_server(void *arg) * don't want log records sitting in the buffer over the time it * takes to sync out an earlier file. */ + did_work = true; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* * Slots depend on future activity. Force out buffered @@ -739,15 +770,14 @@ __log_server(void *arg) * and a buffer may need to wait for the write_lsn to advance * in the case of a synchronous buffer. We end up with a hang. */ - WT_ERR_BUSY_OK(__wt_log_force_write(session, 0)); + WT_ERR_BUSY_OK(__wt_log_force_write(session, 0, &did_work)); /* * We don't want to archive or pre-allocate files as often as * we want to force out log buffers. Only do it once per second * or if the condition was signalled. */ - if (--freq_per_sec <= 0 || signalled) { - freq_per_sec = WT_FORCE_PER_SECOND; + if (timediff >= WT_THOUSAND || signalled) { /* * Perform log pre-allocation. @@ -788,8 +818,12 @@ __log_server(void *arg) } /* Wait until the next event. 
*/ - WT_ERR(__wt_cond_wait_signal(session, conn->log_cond, - WT_MILLION / WT_FORCE_PER_SECOND, &signalled)); + + WT_ERR(__wt_epoch(session, &start)); + WT_ERR(__wt_cond_auto_wait_signal(session, conn->log_cond, + did_work, &signalled)); + WT_ERR(__wt_epoch(session, &now)); + timediff = WT_TIMEDIFF_MS(now, start); } if (0) { @@ -901,8 +935,9 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) */ WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", false, session_flags, &conn->log_wrlsn_session)); - WT_RET(__wt_cond_alloc(conn->log_wrlsn_session, - "log write lsn server", false, &conn->log_wrlsn_cond)); + WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, + "log write lsn server", false, 10000, WT_MILLION, + &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; @@ -916,13 +951,13 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) if (conn->log_session != NULL) { WT_ASSERT(session, conn->log_cond != NULL); WT_ASSERT(session, conn->log_tid_set == true); - WT_RET(__wt_cond_signal(session, conn->log_cond)); + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); } else { /* The log server gets its own session. */ WT_RET(__wt_open_internal_session(conn, "log-server", false, session_flags, &conn->log_session)); - WT_RET(__wt_cond_alloc(conn->log_session, - "log server", false, &conn->log_cond)); + WT_RET(__wt_cond_auto_alloc(conn->log_session, + "log server", false, 50000, WT_MILLION, &conn->log_cond)); /* * Start the thread. 
@@ -958,7 +993,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) return (0); } if (conn->log_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_cond)); + WT_TRET(__wt_cond_auto_signal(session, conn->log_cond)); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } @@ -973,7 +1008,7 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_auto_signal(session, conn->log_wrlsn_cond)); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } @@ -994,9 +1029,9 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) } /* Destroy the condition variables now that all threads are stopped */ - WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); - WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 58577b4587d..aff422654d7 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -210,10 +210,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) /* * If hash arrays were allocated, free them now. 
*/ - if (s->dhhash != NULL) - __wt_free(session, s->dhhash); - if (s->tablehash != NULL) - __wt_free(session, s->tablehash); + __wt_free(session, s->dhhash); + __wt_free(session, s->tablehash); __wt_free(session, s->hazard); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 08ad105c725..d6e59a50da5 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -86,6 +86,11 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) conn->stat_usecs = (uint64_t)cval.val * WT_MILLION; WT_RET(__wt_config_gets( + session, cfg, "statistics_log.json", &cval)); + if (cval.val != 0) + FLD_SET(conn->stat_flags, WT_CONN_STAT_JSON); + + WT_RET(__wt_config_gets( session, cfg, "statistics_log.on_close", &cval)); if (cval.val != 0) FLD_SET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE); @@ -97,6 +102,10 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (!*runp && !FLD_ISSET(conn->stat_flags, WT_CONN_STAT_ON_CLOSE)) return (0); + /* + * If any statistics logging is done, this must not be a read-only + * connection. + */ WT_RET(__wt_config_gets(session, cfg, "statistics_log.sources", &cval)); WT_RET(__wt_config_subinit(session, &objectconf, &cval)); for (cnt = 0; (ret = __wt_config_next(&objectconf, &k, &v)) == 0; ++cnt) @@ -132,9 +141,24 @@ __statlog_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) WT_ERR(__wt_config_gets(session, cfg, "statistics_log.path", &cval)); WT_ERR(__wt_nfilename(session, cval.str, cval.len, &conn->stat_path)); - WT_ERR(__wt_config_gets( - session, cfg, "statistics_log.timestamp", &cval)); - WT_ERR(__wt_strndup(session, cval.str, cval.len, &conn->stat_format)); + /* + * When using JSON format, use the same timestamp format as MongoDB by + * default. 
+ */ + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + ret = __wt_config_gets( + session, &cfg[1], "statistics_log.timestamp", &cval); + if (ret == WT_NOTFOUND) + WT_ERR(__wt_strdup( + session, "%FT%T.000Z", &conn->stat_format)); + WT_ERR_NOTFOUND_OK(ret); + } + if (conn->stat_format == NULL) { + WT_ERR(__wt_config_gets( + session, cfg, "statistics_log.timestamp", &cval)); + WT_ERR(__wt_strndup( + session, cval.str, cval.len, &conn->stat_format)); + } err: __stat_sources_free(session, &sources); return (ret); @@ -149,22 +173,25 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; - WT_CURSOR_STAT *cst; WT_DECL_ITEM(tmp); WT_DECL_RET; - int64_t *stats; - int i; - const char *desc, *uri; + int64_t val; + size_t prefixlen; + const char *desc, *endprefix, *valstr, *uri; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + bool first, groupfirst; conn = S2C(session); + cursor = NULL; + + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + first = groupfirst = true; /* Build URI and configuration string. */ if (conn_stats) uri = "statistics:"; else { - WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__wt_buf_fmt(session, tmp, "statistics:%s", name)); uri = tmp->data; } @@ -175,31 +202,54 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats) * If we don't find an underlying object, silently ignore it, the object * may exist only intermittently. 
*/ - switch (ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) { - case 0: - cst = (WT_CURSOR_STAT *)cursor; - for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) { - if (conn_stats) - WT_ERR(__wt_stat_connection_desc(cst, i, - &desc)); - else - WT_ERR(__wt_stat_dsrc_desc(cst, i, &desc)); + if ((ret = __wt_curstat_open(session, uri, NULL, cfg, &cursor)) != 0) { + if (ret == EBUSY || ret == ENOENT || ret == WT_NOTFOUND) + ret = 0; + goto err; + } + + if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) { + WT_ERR(__wt_fprintf(conn->stat_fp, + "{\"version\":\"%s\",\"localTime\":\"%s\"", + WIREDTIGER_VERSION_STRING, conn->stat_stamp)); + WT_ERR(__wt_fprintf(conn->stat_fp, ",\"wiredTiger\":{")); + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val)); + /* Check if we are starting a new section. */ + endprefix = strchr(desc, ':'); + prefixlen = WT_PTRDIFF(endprefix, desc); + WT_ASSERT(session, endprefix != NULL); + if (first || + tmp->size != prefixlen || + strncmp(desc, tmp->data, tmp->size) != 0) { + WT_ERR(__wt_buf_set( + session, tmp, desc, prefixlen)); + WT_ERR(__wt_fprintf(conn->stat_fp, + "%s\"%.*s\":{", first ? "" : "},", + (int)prefixlen, desc)); + first = false; + groupfirst = true; + } + WT_ERR(__wt_fprintf(conn->stat_fp, + "%s\"%s\":%" PRId64, + groupfirst ? 
"" : ",", endprefix + 2, val)); + groupfirst = false; + } + WT_ERR_NOTFOUND_OK(ret); + WT_ERR(__wt_fprintf(conn->stat_fp, "}}}\n")); + } else { + while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val)); WT_ERR(__wt_fprintf(conn->stat_fp, "%s %" PRId64 " %s %s\n", - conn->stat_stamp, stats[i], name, desc)); + conn->stat_stamp, val, name, desc)); } - WT_ERR(cursor->close(cursor)); - break; - case EBUSY: - case ENOENT: - case WT_NOTFOUND: - ret = 0; - break; - default: - break; + WT_ERR_NOTFOUND_OK(ret); } err: __wt_scr_free(session, &tmp); + if (cursor != NULL) + WT_TRET(cursor->close(cursor)); return (ret); } @@ -342,7 +392,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) if (conn->stat_sources != NULL) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_conn_btree_apply( - session, false, NULL, __statlog_apply, NULL)); + session, NULL, __statlog_apply, NULL, NULL)); WT_RET(ret); } diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 7628076e605..cc0aa5a1322 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -91,9 +91,9 @@ __sweep_expire_one(WT_SESSION_IMPL *session) goto err; /* - * Mark the handle as dead and close the underlying file - * handle. Closing the handle decrements the open file count, - * meaning the close loop won't overrun the configured minimum. + * Mark the handle dead and close the underlying file handle. + * Closing the handle decrements the open file count, meaning the close + * loop won't overrun the configured minimum. */ ret = __wt_conn_btree_sync_and_close(session, false, true); @@ -163,7 +163,7 @@ __sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp) !F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; - /* If the handle is marked "dead", flush it from cache. */ + /* If the handle is marked dead, flush it from cache. 
*/ WT_WITH_DHANDLE(session, dhandle, ret = __wt_conn_btree_sync_and_close(session, false, false)); diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index d7d74da48d4..2fb0c464a76 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -8,12 +8,12 @@ #include "wt_internal.h" -static int __backup_all(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); +static int __backup_all(WT_SESSION_IMPL *); static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *); static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, bool); -static int __backup_list_all_append(WT_SESSION_IMPL *, const char *[]); static int __backup_list_append( WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *); +static int __backup_list_uri_append(WT_SESSION_IMPL *, const char *, bool *); static int __backup_start( WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *[]); static int __backup_stop(WT_SESSION_IMPL *); @@ -103,22 +103,22 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_notsup, /* get-value */ - __wt_cursor_notsup, /* set-key */ - __wt_cursor_notsup, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curbackup_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curbackup_reset, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curbackup_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value_notsup, /* get-value */ + __wt_cursor_set_key_notsup, /* set-key */ + __wt_cursor_set_value_notsup, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curbackup_next, /* next */ + __wt_cursor_notsup, /* prev */ + 
__curbackup_reset, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curbackup_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_BACKUP *cb; WT_DECL_RET; @@ -140,8 +140,9 @@ __wt_curbackup_open(WT_SESSION_IMPL *session, * Start the backup and fill in the cursor's list. Acquire the schema * lock, we need a consistent view when creating a copy. */ - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __backup_start(session, cb, cfg)); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __backup_start(session, cb, cfg))); WT_ERR(ret); /* __wt_cursor_init is last so we don't have to clean up on error. */ @@ -241,7 +242,7 @@ __backup_start( if (!target_list) { WT_ERR(__backup_log_append(session, cb, true)); - WT_ERR(__backup_all(session, cb)); + WT_ERR(__backup_all(session)); } /* Add the hot backup and standard WiredTiger files to the list. */ @@ -332,55 +333,14 @@ __backup_stop(WT_SESSION_IMPL *session) * Backup all objects in the database. */ static int -__backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) +__backup_all(WT_SESSION_IMPL *session) { - WT_CONFIG_ITEM cval; - WT_CURSOR *cursor; WT_DECL_RET; - const char *key, *value; - - cursor = NULL; - - /* Copy all of the metadata entries to the hot backup file. */ - WT_RET(__wt_metadata_cursor(session, &cursor)); - while ((ret = cursor->next(cursor)) == 0) { - WT_ERR(cursor->get_key(cursor, &key)); - WT_ERR(cursor->get_value(cursor, &value)); - WT_ERR(__wt_fprintf(cb->bfp, "%s\n%s\n", key, value)); - - /* - * While reading the metadata file, check there are no "sources" - * or "types" which can't support hot backup. 
This checks for - * a data source that's non-standard, which can't be backed up, - * but is also sanity checking: if there's an entry backed by - * anything other than a file or lsm entry, we're confused. - */ - if ((ret = __wt_config_getones( - session, value, "type", &cval)) == 0 && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm")) - WT_ERR_MSG(session, ENOTSUP, - "hot backup is not supported for objects of " - "type %.*s", (int)cval.len, cval.str); - WT_ERR_NOTFOUND_OK(ret); - if ((ret =__wt_config_getones( - session, value, "source", &cval)) == 0 && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") && - !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:")) - WT_ERR_MSG(session, ENOTSUP, - "hot backup is not supported for objects of " - "source %.*s", (int)cval.len, cval.str); - WT_ERR_NOTFOUND_OK(ret); - } - WT_ERR_NOTFOUND_OK(ret); - - WT_ERR(__wt_metadata_cursor_release(session, &cursor)); /* Build a list of the file objects that need to be copied. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = - __wt_meta_btree_apply(session, __backup_list_all_append, NULL)); + __wt_meta_apply_all(session, NULL, __backup_list_uri_append, NULL)); -err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } @@ -430,11 +390,11 @@ __backup_uri(WT_SESSION_IMPL *session, */ if (WT_PREFIX_MATCH(uri, "log:")) { *log_only = !target_list; - WT_ERR(__wt_backup_list_uri_append(session, uri, NULL)); + WT_ERR(__backup_list_uri_append(session, uri, NULL)); } else { *log_only = false; WT_ERR(__wt_schema_worker(session, - uri, NULL, __wt_backup_list_uri_append, cfg, 0)); + uri, NULL, __backup_list_uri_append, cfg, 0)); } } WT_ERR_NOTFOUND_OK(ret); @@ -471,12 +431,12 @@ __wt_backup_file_remove(WT_SESSION_IMPL *session) } /* - * __wt_backup_list_uri_append -- + * __backup_list_uri_append -- * Append a new file name to the list, allocate space as necessary. * Called via the schema_worker function. 
*/ -int -__wt_backup_list_uri_append( +static int +__backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, bool *skip) { WT_CURSOR_BACKUP *cb; @@ -485,11 +445,31 @@ __wt_backup_list_uri_append( cb = session->bkp_cursor; WT_UNUSED(skip); + /* + * While reading the metadata file, check there are no data sources + * that can't support hot backup. This checks for a data source that's + * non-standard, which can't be backed up, but is also sanity checking: + * if there's an entry backed by anything other than a file or lsm + * entry, we're confused. + */ if (WT_PREFIX_MATCH(name, "log:")) { WT_RET(__backup_log_append(session, cb, false)); return (0); } + if (!WT_PREFIX_MATCH(name, "file:") && + !WT_PREFIX_MATCH(name, "colgroup:") && + !WT_PREFIX_MATCH(name, "index:") && + !WT_PREFIX_MATCH(name, "lsm:") && + !WT_PREFIX_MATCH(name, "table:")) + WT_RET_MSG(session, ENOTSUP, + "hot backup is not supported for objects of type %s", + name); + + /* Ignore the lookaside table. */ + if (strcmp(name, WT_LAS_URI) == 0) + return (0); + /* Add the metadata entry to the backup file. */ WT_RET(__wt_metadata_search(session, name, &value)); WT_RET(__wt_fprintf(cb->bfp, "%s\n%s\n", name, value)); @@ -503,34 +483,6 @@ __wt_backup_list_uri_append( } /* - * __backup_list_all_append -- - * Append a new file name to the list, allocate space as necessary. - * Called via the __wt_meta_btree_apply function. - */ -static int -__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) -{ - WT_CURSOR_BACKUP *cb; - const char *name; - - WT_UNUSED(cfg); - - cb = session->bkp_cursor; - name = session->dhandle->name; - - /* Ignore files in the process of being bulk-loaded. */ - if (F_ISSET(S2BT(session), WT_BTREE_BULK)) - return (0); - - /* Ignore the lookaside table. */ - if (strcmp(name, WT_LAS_URI) == 0) - return (0); - - /* Add the file to the list of files to be copied. 
*/ - return (__backup_list_append(session, cb, name)); -} - -/* * __backup_list_append -- * Append a new file name to the list, allocate space as necessary. */ @@ -541,7 +493,6 @@ __backup_list_append( WT_CURSOR_BACKUP_ENTRY *p; WT_DATA_HANDLE *old_dhandle; WT_DECL_RET; - bool need_handle; const char *name; /* Leave a NULL at the end to mark the end of the list. */ @@ -551,11 +502,26 @@ __backup_list_append( p[0].name = p[1].name = NULL; p[0].handle = p[1].handle = NULL; - need_handle = false; name = uri; + + /* + * If it's a file in the database, get a handle for the underlying + * object (this handle blocks schema level operations, for example + * WT_SESSION.drop or an LSM file discard after level merging). + * + * If the handle is busy (e.g., it is being bulk-loaded), silently skip + * it. We have a special fake checkpoint in the metadata, and recovery + * will recreate an empty file. + */ if (WT_PREFIX_MATCH(uri, "file:")) { - need_handle = true; name += strlen("file:"); + + old_dhandle = session->dhandle; + ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); + p->handle = session->dhandle; + session->dhandle = old_dhandle; + if (ret != 0) + return (ret == EBUSY ? 0 : ret); } /* @@ -569,20 +535,6 @@ __backup_list_append( */ WT_RET(__wt_strdup(session, name, &p->name)); - /* - * If it's a file in the database, get a handle for the underlying - * object (this handle blocks schema level operations, for example - * WT_SESSION.drop or an LSM file discard after level merging). 
- */ - if (need_handle) { - old_dhandle = session->dhandle; - if ((ret = - __wt_session_get_btree(session, uri, NULL, NULL, 0)) == 0) - p->handle = session->dhandle; - session->dhandle = old_dhandle; - WT_RET(ret); - } - ++cb->list_next; return (0); } diff --git a/src/cursor/cur_config.c b/src/cursor/cur_config.c index 1b2fec0eb89..e0d270e4245 100644 --- a/src/cursor/cur_config.c +++ b/src/cursor/cur_config.c @@ -27,21 +27,21 @@ __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_noop, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_noop, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ __curconfig_close); WT_CURSOR_CONFIG *cconfig; WT_CURSOR *cursor; diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index 2a598c99523..804c24a3d2e 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -449,22 +449,22 @@ __wt_curds_open( const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR 
**cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curds_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curds_next, /* next */ - __curds_prev, /* prev */ - __curds_reset, /* reset */ - __curds_search, /* search */ - __curds_search_near, /* search-near */ - __curds_insert, /* insert */ - __curds_update, /* update */ - __curds_remove, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curds_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curds_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curds_next, /* next */ + __curds_prev, /* prev */ + __curds_reset, /* reset */ + __curds_search, /* search */ + __curds_search_near, /* search-near */ + __curds_insert, /* insert */ + __curds_update, /* update */ + __curds_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 3324efd96cc..a7b1c98871a 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -348,22 +348,22 @@ int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __curdump_get_key, /* get-key */ - __curdump_get_value, /* get-value */ - __curdump_set_key, /* set-key */ - __curdump_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curdump_next, /* next */ - __curdump_prev, /* prev */ - __curdump_reset, /* reset */ - __curdump_search, /* search */ - __curdump_search_near, /* search-near */ - __curdump_insert, /* insert */ - __curdump_update, /* update */ - __curdump_remove, /* remove 
*/ - __wt_cursor_notsup, /* reconfigure */ - __curdump_close); /* close */ + __curdump_get_key, /* get-key */ + __curdump_get_value, /* get-value */ + __curdump_set_key, /* set-key */ + __curdump_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curdump_next, /* next */ + __curdump_prev, /* prev */ + __curdump_reset, /* reset */ + __curdump_search, /* search */ + __curdump_search_near, /* search-near */ + __curdump_insert, /* insert */ + __curdump_update, /* update */ + __curdump_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curdump_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_DUMP *cdump; WT_CURSOR_JSON *json; diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index 8bbe1cc8eda..fac903b4770 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -397,22 +397,22 @@ __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curfile_compare, /* compare */ - __curfile_equals, /* equals */ - __curfile_next, /* next */ - __curfile_prev, /* prev */ - __curfile_reset, /* reset */ - __curfile_search, /* search */ - __curfile_search_near, /* search-near */ - __curfile_insert, /* insert */ - __curfile_update, /* update */ - __curfile_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - __curfile_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curfile_compare, /* compare */ + __curfile_equals, /* equals */ + __curfile_next, /* next */ + __curfile_prev, /* prev */ + __curfile_reset, /* reset */ + __curfile_search, /* search */ + __curfile_search_near, /* search-near */ + __curfile_insert, /* insert */ + 
__curfile_update, /* update */ + __curfile_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __curfile_close); /* close */ WT_BTREE *btree; WT_CONFIG_ITEM cval; WT_CURSOR *cursor; diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 6822055131a..dbe8046ca21 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -386,22 +386,22 @@ __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __curindex_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __curindex_set_value, /* set-value */ - __curindex_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curindex_next, /* next */ - __curindex_prev, /* prev */ - __curindex_reset, /* reset */ - __curindex_search, /* search */ - __curindex_search_near, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curindex_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __curindex_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __curindex_set_value, /* set-value */ + __curindex_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curindex_next, /* next */ + __curindex_prev, /* prev */ + __curindex_reset, /* reset */ + __curindex_search, /* search */ + __curindex_search_near, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curindex_close); /* close */ WT_CURSOR_INDEX *cindex; WT_CURSOR *cursor; WT_DECL_ITEM(tmp); diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c index 2cbefa68c5e..38a83217933 100644 --- a/src/cursor/cur_join.c +++ b/src/cursor/cur_join.c @@ -8,6 +8,9 @@ #include "wt_internal.h" +static int __curjoin_insert_endpoint(WT_SESSION_IMPL *, 
+ WT_CURSOR_JOIN_ENTRY *, u_int, WT_CURSOR_JOIN_ENDPOINT **); + /* * __curjoin_entry_iter_init -- * Initialize an iteration for the index managed by a join entry. @@ -17,49 +20,56 @@ static int __curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp) { - WT_CURSOR *newcur; WT_CURSOR *to_dup; WT_DECL_RET; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; const char *def_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), NULL }; - const char *uri, **config; - char *uribuf; + const char *urimain, **config; + char *mainbuf, *uri; WT_CURSOR_JOIN_ITER *iter; size_t size; iter = NULL; - uribuf = NULL; + mainbuf = uri = NULL; to_dup = entry->ends[0].cursor; - uri = to_dup->uri; if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW)) config = &raw_cfg[0]; else config = &def_cfg[0]; + size = strlen(to_dup->internal_uri) + 3; + WT_ERR(__wt_calloc(session, size, 1, &uri)); + snprintf(uri, size, "%s()", to_dup->internal_uri); + urimain = cjoin->table->name; if (cjoin->projection != NULL) { - size = strlen(uri) + strlen(cjoin->projection) + 1; - WT_ERR(__wt_calloc(session, size, 1, &uribuf)); - snprintf(uribuf, size, "%s%s", uri, cjoin->projection); - uri = uribuf; + size = strlen(urimain) + strlen(cjoin->projection) + 1; + WT_ERR(__wt_calloc(session, size, 1, &mainbuf)); + snprintf(mainbuf, size, "%s%s", urimain, cjoin->projection); + urimain = mainbuf; } - WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config, - &newcur)); - WT_ERR(__wt_cursor_dup_position(to_dup, newcur)); + WT_ERR(__wt_calloc_one(session, &iter)); + WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config, + &iter->cursor)); + WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor)); + WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config, + &iter->main)); iter->cjoin = cjoin; iter->session = session; iter->entry = entry; - iter->cursor = newcur; - 
iter->advance = false; + iter->positioned = false; + iter->isequal = (entry->ends_next == 1 && + WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ); *iterp = iter; if (0) { err: __wt_free(session, iter); } - __wt_free(session, uribuf); + __wt_free(session, mainbuf); + __wt_free(session, uri); return (ret); } @@ -72,18 +82,70 @@ static int __curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf, size_t bufsize, WT_ITEM *item) { - WT_DECL_RET; WT_SESSION *wtsession; size_t sz; wtsession = (WT_SESSION *)session; - WT_ERR(wiredtiger_struct_size(wtsession, &sz, "r", r)); + WT_RET(wiredtiger_struct_size(wtsession, &sz, "r", r)); WT_ASSERT(session, sz < bufsize); - WT_ERR(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); + WT_RET(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); item->size = sz; item->data = buf; + return (0); +} + +/* + * __curjoin_split_key -- + * Copy the primary key from a cursor (either main table or index) + * to another cursor. When copying from an index file, the index + * key is also returned. + * + */ +static int +__curjoin_split_key(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, + WT_ITEM *idxkey, WT_CURSOR *tocur, WT_CURSOR *fromcur, + const char *repack_fmt, bool isindex) +{ + WT_CURSOR *firstcg_cur; + WT_CURSOR_INDEX *cindex; + WT_ITEM *keyp; + const uint8_t *p; -err: return (ret); + if (isindex) { + cindex = ((WT_CURSOR_INDEX *)fromcur); + /* + * Repack tells us where the index key ends; advance past + * that to get where the raw primary key starts. + */ + WT_RET(__wt_struct_repack(session, cindex->child->key_format, + repack_fmt != NULL ? 
repack_fmt : cindex->iface.key_format, + &cindex->child->key, idxkey)); + WT_ASSERT(session, cindex->child->key.size > idxkey->size); + tocur->key.data = (uint8_t *)idxkey->data + idxkey->size; + tocur->key.size = cindex->child->key.size - idxkey->size; + if (WT_CURSOR_RECNO(tocur)) { + p = (const uint8_t *)tocur->key.data; + WT_RET(__wt_vunpack_uint(&p, tocur->key.size, + &tocur->recno)); + } else + tocur->recno = 0; + } else { + firstcg_cur = ((WT_CURSOR_TABLE *)fromcur)->cg_cursors[0]; + keyp = &firstcg_cur->key; + if (WT_CURSOR_RECNO(tocur)) { + WT_ASSERT(session, keyp->size == sizeof(uint64_t)); + tocur->recno = *(uint64_t *)keyp->data; + WT_RET(__curjoin_pack_recno(session, tocur->recno, + cjoin->recno_buf, sizeof(cjoin->recno_buf), + &tocur->key)); + } else { + WT_ITEM_SET(tocur->key, *keyp); + tocur->recno = 0; + } + idxkey->data = NULL; + idxkey->size = 0; + } + return (0); } /* @@ -92,45 +154,24 @@ err: return (ret); * */ static int -__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, - uint64_t *rp) +__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor) { - WT_CURSOR *firstcg_cur; - WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; - WT_SESSION_IMPL *session; - uint64_t r; - - if (iter->advance) - WT_ERR(iter->cursor->next(iter->cursor)); + if (iter->positioned) + WT_RET(iter->cursor->next(iter->cursor)); else - iter->advance = true; - - session = iter->session; - cjoin = iter->cjoin; + iter->positioned = true; /* * Set our key to the primary key, we'll also need this * to check membership. 
*/ - if (iter->entry->index != NULL) - firstcg_cur = ((WT_CURSOR_INDEX *)iter->cursor)->cg_cursors[0]; - else - firstcg_cur = ((WT_CURSOR_TABLE *)iter->cursor)->cg_cursors[0]; - if (WT_CURSOR_RECNO(&cjoin->iface)) { - r = *(uint64_t *)firstcg_cur->key.data; - WT_ERR(__curjoin_pack_recno(session, r, cjoin->recno_buf, - sizeof(cjoin->recno_buf), primkey)); - *rp = r; - } else { - WT_ITEM_SET(*primkey, firstcg_cur->key); - *rp = 0; - } - iter->curkey = primkey; + WT_RET(__curjoin_split_key(iter->session, iter->cjoin, &iter->idxkey, + cursor, iter->cursor, iter->entry->repack_format, + iter->entry->index != NULL)); + iter->curkey = &cursor->key; iter->entry->stats.actual_count++; iter->entry->stats.accesses++; - -err: return (ret); + return (0); } /* @@ -141,17 +182,15 @@ err: return (ret); static int __curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter) { - WT_DECL_RET; - - if (iter->advance) { - WT_ERR(iter->cursor->reset(iter->cursor)); - WT_ERR(__wt_cursor_dup_position( + if (iter->positioned) { + WT_RET(iter->cursor->reset(iter->cursor)); + WT_RET(iter->main->reset(iter->main)); + WT_RET(__wt_cursor_dup_position( iter->cjoin->entries[0].ends[0].cursor, iter->cursor)); - iter->advance = false; + iter->positioned = false; iter->entry->stats.actual_count = 0; } - -err: return (ret); + return (0); } /* @@ -162,7 +201,7 @@ err: return (ret); static bool __curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter) { - return (iter->advance); + return (iter->positioned); } /* @@ -177,6 +216,8 @@ __curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter) if (iter->cursor != NULL) WT_TRET(iter->cursor->close(iter->cursor)); + if (iter->main != NULL) + WT_TRET(iter->main->close(iter->main)); __wt_free(iter->session, iter); return (ret); @@ -232,10 +273,8 @@ __curjoin_get_value(WT_CURSOR *cursor, ...) 
!__curjoin_entry_iter_ready(iter)) WT_ERR_MSG(session, EINVAL, "join cursor must be advanced with next()"); - if (iter->entry->index != NULL) - WT_ERR(__wt_curindex_get_valuev(iter->cursor, ap)); - else - WT_ERR(__wt_curtable_get_valuev(iter->cursor, ap)); + + WT_ERR(__wt_curtable_get_valuev(iter->main, ap)); err: va_end(ap); API_END_RET(session, ret); @@ -251,43 +290,26 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, { WT_COLLATOR *collator; WT_CURSOR *c; - WT_CURSOR_INDEX *cindex; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; WT_DECL_RET; WT_DECL_ITEM(uribuf); - WT_ITEM curkey, curvalue, *k; - WT_TABLE *maintable; + WT_ITEM curkey, curvalue; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; - const char *mainkey_str, *p; - void *allocbuf; - size_t mainkey_len, size; - u_int i; + const char *uri; + size_t size; int cmp, skip; c = NULL; - allocbuf = NULL; skip = 0; - if (entry->index != NULL) { + if (entry->index != NULL) /* - * Open a cursor having a projection of the keys of the - * index we're comparing against. Open it raw, we're - * going to compare it to the raw keys of the - * reference cursors. + * Open the raw index. We're avoiding any references + * to the main table, they may be expensive. */ - maintable = ((WT_CURSOR_TABLE *)entry->main)->table; - mainkey_str = maintable->colconf.str + 1; - for (p = mainkey_str, i = 0; - p != NULL && i < maintable->nkey_columns; i++) - p = strchr(p + 1, ','); - WT_ASSERT(session, p != 0); - mainkey_len = WT_PTRDIFF(p, mainkey_str); - size = strlen(entry->index->name) + mainkey_len + 3; - WT_ERR(__wt_scr_alloc(session, size, &uribuf)); - WT_ERR(__wt_buf_fmt(session, uribuf, "%s(%.*s)", - entry->index->name, (int)mainkey_len, mainkey_str)); - } else { + uri = entry->index->source; + else { /* * For joins on the main table, we just need the primary * key for comparison, we don't need any values. 
@@ -296,35 +318,38 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_ERR(__wt_scr_alloc(session, size, &uribuf)); WT_ERR(__wt_buf_fmt(session, uribuf, "%s()", cjoin->table->name)); + uri = uribuf->data; } - WT_ERR(__wt_open_cursor( - session, uribuf->data, &cjoin->iface, raw_cfg, &c)); + WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c)); /* Initially position the cursor if necessary. */ endmax = &entry->ends[entry->ends_next]; - if ((end = &entry->ends[0]) < endmax && - F_ISSET(end, WT_CURJOIN_END_GE)) { - WT_ERR(__wt_cursor_dup_position(end->cursor, c)); - if (end->flags == WT_CURJOIN_END_GE) - skip = 1; + if ((end = &entry->ends[0]) < endmax) { + if (F_ISSET(end, WT_CURJOIN_END_GT) || + WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) { + WT_ERR(__wt_cursor_dup_position(end->cursor, c)); + if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE) + skip = 1; + } else if (F_ISSET(end, WT_CURJOIN_END_LT)) { + if ((ret = c->next(c)) == WT_NOTFOUND) + goto done; + WT_ERR(ret); + } else + WT_ERR(__wt_illegal_value(session, NULL)); } collator = (entry->index == NULL) ? NULL : entry->index->collator; while (ret == 0) { WT_ERR(c->get_key(c, &curkey)); if (entry->index != NULL) { - cindex = (WT_CURSOR_INDEX *)c; - if (cindex->index->extractor == NULL) { - /* - * Repack so it's comparable to the - * reference endpoints. - */ - k = &cindex->child->key; - WT_ERR(__wt_struct_repack(session, - cindex->child->key_format, - entry->main->value_format, k, &curkey, - &allocbuf)); - } else - curkey = cindex->child->key; + /* + * Repack so it's comparable to the + * reference endpoints. + */ + WT_ERR(__wt_struct_repack(session, + c->key_format, + (entry->repack_format != NULL ? 
+ entry->repack_format : entry->index->idxkey_format), + &c->key, &curkey)); } for (end = &entry->ends[skip]; end < endmax; end++) { WT_ERR(__wt_compare(session, collator, &curkey, @@ -345,8 +370,12 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, goto done; } } - if (entry->index != NULL) - WT_ERR(c->get_value(c, &curvalue)); + if (entry->index != NULL) { + curvalue.data = + (unsigned char *)curkey.data + curkey.size; + WT_ASSERT(session, c->key.size > curkey.size); + curvalue.size = c->key.size - curkey.size; + } else WT_ERR(c->get_key(c, &curvalue)); WT_ERR(__wt_bloom_insert(bloom, &curvalue)); @@ -361,7 +390,6 @@ done: err: if (c != NULL) WT_TRET(c->close(c)); __wt_scr_free(session, &uribuf); - __wt_free(session, allocbuf); return (ret); } @@ -375,27 +403,23 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, { WT_CURSOR *cursor; WT_CURSOR_INDEX *cindex; - WT_DECL_RET; WT_ITEM *k; uint64_t r; - void *allocbuf; - allocbuf = NULL; if ((cursor = endpoint->cursor) != NULL) { if (entry->index != NULL) { /* Extract and save the index's logical key. */ cindex = (WT_CURSOR_INDEX *)endpoint->cursor; - WT_ERR(__wt_struct_repack(session, + WT_RET(__wt_struct_repack(session, cindex->child->key_format, - cindex->iface.key_format, - &cindex->child->key, &endpoint->key, &allocbuf)); - if (allocbuf != NULL) - F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); + (entry->repack_format != NULL ? 
+ entry->repack_format : cindex->iface.key_format), + &cindex->child->key, &endpoint->key)); } else { k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key; if (WT_CURSOR_RECNO(cursor)) { r = *(uint64_t *)k->data; - WT_ERR(__curjoin_pack_recno(session, r, + WT_RET(__curjoin_pack_recno(session, r, endpoint->recno_buf, sizeof(endpoint->recno_buf), &endpoint->key)); @@ -404,10 +428,7 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, endpoint->key = *k; } } - if (0) { -err: __wt_free(session, allocbuf); - } - return (ret); + return (0); } /* @@ -419,8 +440,13 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) { WT_BLOOM *bloom; WT_DECL_RET; + WT_CURSOR *origcur; WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2; WT_CURSOR_JOIN_ENDPOINT *end; + const char *def_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), NULL }; + const char *raw_cfg[] = { WT_CONFIG_BASE( + session, WT_SESSION_open_cursor), "raw", NULL }; uint32_t f, k; if (cjoin->entries_next == 0) @@ -429,9 +455,27 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) "cursors"); je = &cjoin->entries[0]; + jeend = &cjoin->entries[cjoin->entries_next]; + + /* + * For a single compare=le endpoint in the first iterated entry, + * construct a companion compare=ge endpoint that will actually + * be iterated. + */ + if (((je = cjoin->entries) != jeend) && + je->ends_next == 1 && F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) { + origcur = je->ends[0].cursor; + WT_RET(__curjoin_insert_endpoint(session, je, 0, &end)); + WT_RET(__wt_open_cursor(session, origcur->uri, + (WT_CURSOR *)cjoin, + F_ISSET(origcur, WT_CURSTD_RAW) ? 
raw_cfg : def_cfg, + &end->cursor)); + WT_RET(end->cursor->next(end->cursor)); + end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | + WT_CURJOIN_END_OWN_CURSOR; + } WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter)); - jeend = &cjoin->entries[cjoin->entries_next]; for (je = cjoin->entries; je < jeend; je++) { __wt_stat_join_init_single(&je->stats); for (end = &je->ends[0]; end < &je->ends[je->ends_next]; @@ -449,6 +493,10 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin) F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT); if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) { + if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) + WT_RET_MSG(session, EINVAL, + "join cursors with Bloom filters cannot be " + "used with read-uncommitted isolation"); if (je->bloom == NULL) { /* * Look for compatible filters to be shared, @@ -520,35 +568,34 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, { WT_COLLATOR *collator; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; - WT_DECL_RET; int cmp; collator = (entry->index != NULL) ? entry->index->collator : NULL; endmax = &entry->ends[entry->ends_next]; for (end = &entry->ends[skip_left ? 
1 : 0]; end < endmax; end++) { - WT_ERR(__wt_compare(session, collator, curkey, &end->key, + WT_RET(__wt_compare(session, collator, curkey, &end->key, &cmp)); if (!F_ISSET(end, WT_CURJOIN_END_LT)) { if (cmp < 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } else { if (cmp > 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp < 0 && !F_ISSET(end, WT_CURJOIN_END_LT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } } -err: return (ret); + return (0); } typedef struct { WT_CURSOR iface; WT_CURSOR_JOIN_ENTRY *entry; - int ismember; + bool ismember; } WT_CURJOIN_EXTRACTOR; /* @@ -584,8 +631,8 @@ __curjoin_extract_insert(WT_CURSOR *cursor) { ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false); if (ret == WT_NOTFOUND) ret = 0; - else - cextract->ismember = 1; + else if (ret == 0) + cextract->ismember = true; return (ret); } @@ -602,27 +649,29 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURJOIN_EXTRACTOR extract_cursor; WT_CURSOR *c; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_notsup, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __curjoin_extract_insert, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* reconfigure */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + 
__wt_cursor_notsup, /* prev */ + __wt_cursor_notsup, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __curjoin_extract_insert, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup); /* close */ WT_DECL_RET; WT_INDEX *idx; WT_ITEM *key, v; bool bloom_found; + if (skip_left && entry->ends_next == 1) + return (0); /* no checks to make */ key = cjoin->iter->curkey; entry->stats.accesses++; bloom_found = false; @@ -645,24 +694,35 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, bloom_found = true; } if (entry->index != NULL) { - memset(&v, 0, sizeof(v)); /* Keep lint quiet. */ - c = entry->main; - c->set_key(c, key); - if ((ret = c->search(c)) == 0) - ret = c->get_value(c, &v); - else if (ret == WT_NOTFOUND) - WT_ERR_MSG(session, WT_ERROR, - "main table for join is missing entry."); - WT_TRET(c->reset(c)); - WT_ERR(ret); + /* + * If this entry is used by the iterator, then we already + * have the index key, and we won't have to do any extraction + * either. + */ + if (entry == cjoin->iter->entry) + WT_ITEM_SET(v, cjoin->iter->idxkey); + else { + memset(&v, 0, sizeof(v)); /* Keep lint quiet. 
*/ + c = entry->main; + c->set_key(c, key); + if ((ret = c->search(c)) == 0) + ret = c->get_value(c, &v); + else if (ret == WT_NOTFOUND) + WT_ERR_MSG(session, WT_ERROR, + "main table for join is missing entry"); + WT_TRET(c->reset(c)); + WT_ERR(ret); + } } else - v = *key; + WT_ITEM_SET(v, *key); - if ((idx = entry->index) != NULL && idx->extractor != NULL) { + if ((idx = entry->index) != NULL && idx->extractor != NULL && + entry != cjoin->iter->entry) { + WT_CLEAR(extract_cursor); extract_cursor.iface = iface; extract_cursor.iface.session = &session->iface; extract_cursor.iface.key_format = idx->exkey_format; - extract_cursor.ismember = 0; + extract_cursor.ismember = false; extract_cursor.entry = entry; WT_ERR(idx->extractor->extract(idx->extractor, &session->iface, key, &v, &extract_cursor.iface)); @@ -685,7 +745,9 @@ err: if (ret == WT_NOTFOUND && bloom_found) static int __curjoin_next(WT_CURSOR *cursor) { + WT_CURSOR *c; WT_CURSOR_JOIN *cjoin; + WT_CURSOR_JOIN_ITER *iter; WT_DECL_RET; WT_SESSION_IMPL *session; bool skip_left; @@ -701,9 +763,11 @@ __curjoin_next(WT_CURSOR *cursor) if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED)) WT_ERR(__curjoin_init_iter(session, cjoin)); + F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + iter = cjoin->iter; + nextkey: - if ((ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key, - &cursor->recno)) == 0) { + if ((ret = __curjoin_entry_iter_next(iter, cursor)) == 0) { F_SET(cursor, WT_CURSTD_KEY_EXT); /* @@ -715,11 +779,31 @@ nextkey: for (i = 0; i < cjoin->entries_next; i++) { ret = __curjoin_entry_member(session, cjoin, &cjoin->entries[i], skip_left); - if (ret == WT_NOTFOUND) + if (ret == WT_NOTFOUND) { + /* + * If this is compare=eq on our outer iterator, + * and we've moved past it, we're done. 
+ */ + if (iter->isequal && i == 0) + break; goto nextkey; + } skip_left = false; WT_ERR(ret); } + } else if (ret != WT_NOTFOUND) + WT_ERR(ret); + + if (ret == 0) { + /* + * Position the 'main' cursor, this will be used to + * retrieve values from the cursor join. + */ + c = iter->main; + c->set_key(c, iter->curkey); + if ((ret = c->search(c)) != 0) + WT_ERR(c->search(c)); + F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); } if (0) { @@ -785,10 +869,11 @@ __curjoin_close(WT_CURSOR *cursor) for (end = &entry->ends[0]; end < &entry->ends[entry->ends_next]; end++) { F_CLR(end->cursor, WT_CURSTD_JOINED); - if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY)) - __wt_free(session, end->key.data); + if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR)) + WT_TRET(end->cursor->close(end->cursor)); } __wt_free(session, entry->ends); + __wt_free(session, entry->repack_format); } if (cjoin->iter != NULL) @@ -810,22 +895,22 @@ __wt_curjoin_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __curjoin_get_key, /* get-key */ - __curjoin_get_value, /* get-value */ - __wt_cursor_notsup, /* set-key */ - __wt_cursor_notsup, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curjoin_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curjoin_reset, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curjoin_close); /* close */ + __curjoin_get_key, /* get-key */ + __curjoin_get_value, /* get-value */ + __wt_cursor_set_key_notsup, /* set-key */ + __wt_cursor_set_value_notsup, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curjoin_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curjoin_reset, /* reset */ + __wt_cursor_notsup, /* 
search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curjoin_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_JOIN *cjoin; WT_DECL_ITEM(tmp); @@ -891,22 +976,22 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range, uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count) { + WT_CURSOR_INDEX *cindex; + WT_CURSOR_JOIN_ENDPOINT *end; WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; - WT_CURSOR_JOIN_ENDPOINT *end, *newend; bool hasins, needbloom, range_eq; - u_int i, ins, nonbloom; + char *main_uri, *newformat; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; - char *main_uri; - size_t namesize, newsize; + size_t len, newsize; + u_int i, ins, nonbloom; entry = NULL; hasins = needbloom = false; ins = 0; /* -Wuninitialized */ main_uri = NULL; nonbloom = 0; /* -Wuninitialized */ - namesize = strlen(cjoin->table->name); for (i = 0; i < cjoin->entries_next; i++) { if (cjoin->entries[i].index == idx) { @@ -982,13 +1067,13 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) || (F_ISSET(end, WT_CURJOIN_END_LT) && ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) || - (end->flags == WT_CURJOIN_END_EQ && + (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) != 0)) WT_ERR_MSG(session, EINVAL, "join has overlapping ranges"); if (range == WT_CURJOIN_END_EQ && - end->flags == WT_CURJOIN_END_EQ && + WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) WT_ERR_MSG(session, EINVAL, "compare=eq can only be combined " @@ -1013,31 +1098,70 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, entry->bloom_hash_count = 
WT_MAX(entry->bloom_hash_count, bloom_hash_count); } - WT_ERR(__wt_realloc_def(session, &entry->ends_allocated, - entry->ends_next + 1, &entry->ends)); - if (!hasins) - ins = entry->ends_next; - newend = &entry->ends[ins]; - memmove(newend + 1, newend, - (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); - memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); - entry->ends_next++; - newend->cursor = ref_cursor; - F_SET(newend, range); + WT_ERR(__curjoin_insert_endpoint(session, entry, + hasins ? ins : entry->ends_next, &end)); + end->cursor = ref_cursor; + F_SET(end, range); /* Open the main file with a projection of the indexed columns. */ - if (entry->main == NULL && entry->index != NULL) { - namesize = strlen(cjoin->table->name); - newsize = namesize + entry->index->colconf.len + 1; + if (entry->main == NULL && idx != NULL) { + newsize = strlen(cjoin->table->name) + idx->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); snprintf(main_uri, newsize, "%s%.*s", - cjoin->table->name, (int)entry->index->colconf.len, - entry->index->colconf.str); + cjoin->table->name, (int)idx->colconf.len, + idx->colconf.str); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); + if (idx->extractor == NULL) { + /* + * Add no-op padding so trailing 'u' formats are not + * transformed to 'U'. This matches what happens in + * the index. We don't do this when we have an + * extractor, extractors already use the padding + * byte trick. + */ + len = strlen(entry->main->value_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &newformat)); + snprintf(newformat, len, "%s0x", + entry->main->value_format); + __wt_free(session, entry->main->value_format); + entry->main->value_format = newformat; + } + + /* + * When we are repacking index keys to remove the primary + * key, we never want to transform trailing 'u'. Use no-op + * padding to force this. 
+ */ + cindex = (WT_CURSOR_INDEX *)ref_cursor; + len = strlen(cindex->iface.key_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &entry->repack_format)); + snprintf(entry->repack_format, len, "%s0x", + cindex->iface.key_format); } -err: if (main_uri != NULL) - __wt_free(session, main_uri); +err: __wt_free(session, main_uri); return (ret); } + +/* + * __curjoin_insert_endpoint -- + * Insert a new entry into the endpoint array for the join entry. + */ +static int +__curjoin_insert_endpoint(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, + u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp) +{ + WT_CURSOR_JOIN_ENDPOINT *newend; + + WT_RET(__wt_realloc_def(session, &entry->ends_allocated, + entry->ends_next + 1, &entry->ends)); + newend = &entry->ends[pos]; + memmove(newend + 1, newend, + (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); + memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); + entry->ends_next++; + *newendp = newend; + + return (0); +} diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index 3fcd8a86066..0a13803da5d 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -347,22 +347,22 @@ __wt_curlog_open(WT_SESSION_IMPL *session, { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __curlog_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curlog_next, /* next */ - __wt_cursor_notsup, /* prev */ - __curlog_reset, /* reset */ - __curlog_search, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curlog_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curlog_compare, /* compare 
*/ + __wt_cursor_equals, /* equals */ + __curlog_next, /* next */ + __wt_cursor_notsup, /* prev */ + __curlog_reset, /* reset */ + __curlog_search, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curlog_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LOG *cl; WT_DECL_RET; @@ -397,7 +397,7 @@ __wt_curlog_open(WT_SESSION_IMPL *session, * The user may be trying to read a log record they just wrote. * Log records may be buffered, so force out any now. */ - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); /* Log cursors block archiving. */ WT_ERR(__wt_readlock(session, log->log_archive_lock)); diff --git a/src/cursor/cur_metadata.c b/src/cursor/cur_metadata.c index df66ef34ddd..3d702e2ea8c 100644 --- a/src/cursor/cur_metadata.c +++ b/src/cursor/cur_metadata.c @@ -31,6 +31,58 @@ } while (0) /* + * __wt_schema_create_final -- + * Create a single configuration line from a set of configuration strings, + * including all of the defaults declared for a session.create, and stripping + * any configuration strings that don't belong in a session.create. Here for + * the wt dump command utility, which reads a set of configuration strings and + * needs to add in the defaults and then collapse them into single string for + * a subsequent load. + */ +int +__wt_schema_create_final( + WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret) +{ + WT_DECL_RET; + u_int i; + const char **cfg; + + /* + * Count the entries in the original, + * Allocate a copy with the defaults as the first entry, + * Collapse the whole thing into a single configuration string (which + * also strips any entries that don't appear in the first entry). 
+ */ + for (i = 0; cfg_arg[i] != NULL; ++i) + ; + WT_RET(__wt_calloc_def(session, i + 2, &cfg)); + cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_create); + for (i = 0; cfg_arg[i] != NULL; ++i) + cfg[i + 1] = cfg_arg[i]; + cfg[i + 1] = NULL; + + ret = __wt_config_collapse(session, cfg, value_ret); + + __wt_free(session, cfg); + return (ret); +} + +/* + * __schema_create_strip -- + * Discard any configuration information from a schema entry that is not + * applicable to an session.create call. Here for the metadata:create URI. + */ +static int +__schema_create_strip( + WT_SESSION_IMPL *session, const char *value, char **value_ret) +{ + const char *cfg[] = + { WT_CONFIG_BASE(session, WT_SESSION_create), value, NULL }; + + return (__wt_config_collapse(session, cfg, value_ret)); +} + +/* * __curmetadata_setkv -- * Copy key/value into the public cursor, stripping internal metadata for * "create-only" cursors. @@ -49,8 +101,7 @@ __curmetadata_setkv(WT_CURSOR_METADATA *mdc, WT_CURSOR *fc) c->key.data = fc->key.data; c->key.size = fc->key.size; if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { - WT_RET(__wt_schema_create_strip( - session, fc->value.data, NULL, &value)); + WT_RET(__schema_create_strip(session, fc->value.data, &value)); ret = __wt_buf_set( session, &c->value, value, strlen(value) + 1); __wt_free(session, value); @@ -92,8 +143,7 @@ __curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor) WT_RET(__wt_metadata_search(session, WT_METAFILE_URI, &value)); if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { - ret = __wt_schema_create_strip( - session, value, NULL, &stripped); + ret = __schema_create_strip(session, value, &stripped); __wt_free(session, value); WT_RET(ret); value = stripped; @@ -448,22 +498,22 @@ __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key 
*/ - __wt_cursor_set_value, /* set-value */ - __curmetadata_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curmetadata_next, /* next */ - __curmetadata_prev, /* prev */ - __curmetadata_reset, /* reset */ - __curmetadata_search, /* search */ - __curmetadata_search_near, /* search-near */ - __curmetadata_insert, /* insert */ - __curmetadata_update, /* update */ - __curmetadata_remove, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curmetadata_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __curmetadata_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curmetadata_next, /* next */ + __curmetadata_prev, /* prev */ + __curmetadata_reset, /* reset */ + __curmetadata_search, /* search */ + __curmetadata_search_near, /* search-near */ + __curmetadata_insert, /* insert */ + __curmetadata_update, /* update */ + __curmetadata_remove, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curmetadata_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_METADATA *mdc; WT_DECL_RET; diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 00a6ade21c6..f7a8f5fc866 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -200,8 +200,6 @@ __curstat_next(WT_CURSOR *cursor) if (cst->notinitialized) { WT_ERR(__wt_curstat_init( session, cursor->internal_uri, NULL, cst->cfg, cst)); - if (cst->next_set != NULL) - WT_ERR((*cst->next_set)(session, cst, true, true)); cst->notinitialized = false; } @@ -209,6 +207,8 @@ __curstat_next(WT_CURSOR *cursor) if (cst->notpositioned) { cst->notpositioned = false; cst->key = WT_STAT_KEY_MIN(cst); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, true, true)); } else if (cst->key < WT_STAT_KEY_MAX(cst)) ++cst->key; else if (cst->next_set != NULL) @@ -244,8 +244,6 @@ __curstat_prev(WT_CURSOR *cursor) if (cst->notinitialized) { 
WT_ERR(__wt_curstat_init( session, cursor->internal_uri, NULL, cst->cfg, cst)); - if (cst->next_set != NULL) - WT_ERR((*cst->next_set)(session, cst, false, true)); cst->notinitialized = false; } @@ -253,6 +251,8 @@ __curstat_prev(WT_CURSOR *cursor) if (cst->notpositioned) { cst->notpositioned = false; cst->key = WT_STAT_KEY_MAX(cst); + if (cst->next_set != NULL) + WT_ERR((*cst->next_set)(session, cst, false, true)); } else if (cst->key > WT_STAT_KEY_MIN(cst)) --cst->key; else if (cst->next_set != NULL) @@ -449,7 +449,6 @@ __curstat_join_next_set(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst, WT_JOIN_STATS_GROUP *join_group; ssize_t pos; - WT_ASSERT(session, WT_STREQ(cst->iface.uri, "statistics:join")); join_group = &cst->u.join_stats_group; cjoin = join_group->join_cursor; if (init) @@ -504,14 +503,13 @@ __curstat_join_init(WT_SESSION_IMPL *session, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) { WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; WT_UNUSED(cfg); if (curjoin == NULL && cst->u.join_stats_group.join_cursor != NULL) curjoin = &cst->u.join_stats_group.join_cursor->iface; if (curjoin == NULL || !WT_PREFIX_MATCH(curjoin->uri, "join:")) - WT_ERR_MSG(session, EINVAL, + WT_RET_MSG(session, EINVAL, "join cursor must be used with statistics:join"); cjoin = (WT_CURSOR_JOIN *)curjoin; memset(&cst->u.join_stats_group, 0, sizeof(WT_JOIN_STATS_GROUP)); @@ -522,8 +520,7 @@ __curstat_join_init(WT_SESSION_IMPL *session, cst->stats_count = sizeof(WT_JOIN_STATS) / sizeof(int64_t); cst->stats_desc = __curstat_join_desc; cst->next_set = __curstat_join_next_set; - -err: return (ret); + return (0); } /* @@ -544,25 +541,28 @@ __wt_curstat_init(WT_SESSION_IMPL *session, dsrc_uri = uri + strlen("statistics:"); if (WT_STREQ(dsrc_uri, "join")) - return (__curstat_join_init(session, curjoin, cfg, cst)); + WT_RET(__curstat_join_init(session, curjoin, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:")) - return ( + else if (WT_PREFIX_MATCH(dsrc_uri, "colgroup:")) + WT_RET( 
__wt_curstat_colgroup_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "file:")) - return (__curstat_file_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "file:")) + WT_RET(__curstat_file_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "index:")) - return (__wt_curstat_index_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "index:")) + WT_RET(__wt_curstat_index_init(session, dsrc_uri, cfg, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "lsm:")) - return (__wt_curstat_lsm_init(session, dsrc_uri, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "lsm:")) + WT_RET(__wt_curstat_lsm_init(session, dsrc_uri, cst)); - if (WT_PREFIX_MATCH(dsrc_uri, "table:")) - return (__wt_curstat_table_init(session, dsrc_uri, cfg, cst)); + else if (WT_PREFIX_MATCH(dsrc_uri, "table:")) + WT_RET(__wt_curstat_table_init(session, dsrc_uri, cfg, cst)); - return (__wt_bad_object_type(session, uri)); + else + return (__wt_bad_object_type(session, uri)); + + return (0); } /* @@ -575,22 +575,22 @@ __wt_curstat_open(WT_SESSION_IMPL *session, { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, - __curstat_get_key, /* get-key */ - __curstat_get_value, /* get-value */ - __curstat_set_key, /* set-key */ - __curstat_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __curstat_next, /* next */ - __curstat_prev, /* prev */ - __curstat_reset, /* reset */ - __curstat_search, /* search */ - __wt_cursor_notsup, /* search-near */ - __wt_cursor_notsup, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup, /* reconfigure */ - __curstat_close); /* close */ + __curstat_get_key, /* get-key */ + __curstat_get_value, /* get-value */ + __curstat_set_key, /* set-key */ + __curstat_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __curstat_next, /* next */ + __curstat_prev, /* 
prev */ + __curstat_reset, /* reset */ + __curstat_search, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __wt_cursor_notsup, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __curstat_close); /* close */ WT_CONFIG_ITEM cval, sval; WT_CURSOR *cursor; WT_CURSOR_STAT *cst; diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 051f36c8854..7839971f975 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -9,27 +9,108 @@ #include "wt_internal.h" /* + * __wt_cursor_noop -- + * Cursor noop. + */ +int +__wt_cursor_noop(WT_CURSOR *cursor) +{ + WT_UNUSED(cursor); + + return (0); +} + +/* * __wt_cursor_notsup -- * Unsupported cursor actions. */ int __wt_cursor_notsup(WT_CURSOR *cursor) { - WT_UNUSED(cursor); + WT_SESSION_IMPL *session; - return (ENOTSUP); + session = (WT_SESSION_IMPL *)cursor->session; + WT_RET_MSG(session, ENOTSUP, "Unsupported cursor operation"); } /* - * __wt_cursor_noop -- - * Cursor noop. + * __wt_cursor_get_value_notsup -- + * WT_CURSOR.get_value not-supported. */ int -__wt_cursor_noop(WT_CURSOR *cursor) +__wt_cursor_get_value_notsup(WT_CURSOR *cursor, ...) { - WT_UNUSED(cursor); + return (__wt_cursor_notsup(cursor)); +} - return (0); +/* + * __wt_cursor_set_key_notsup -- + * WT_CURSOR.set_key not-supported. + */ +void +__wt_cursor_set_key_notsup(WT_CURSOR *cursor, ...) +{ + (void)__wt_cursor_notsup(cursor); +} + +/* + * __wt_cursor_set_value_notsup -- + * WT_CURSOR.set_value not-supported. + */ +void +__wt_cursor_set_value_notsup(WT_CURSOR *cursor, ...) +{ + (void)__wt_cursor_notsup(cursor); +} + +/* + * __wt_cursor_compare_notsup -- + * Unsupported cursor comparison. + */ +int +__wt_cursor_compare_notsup(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) +{ + WT_UNUSED(b); + WT_UNUSED(cmpp); + + return (__wt_cursor_notsup(a)); +} + +/* + * __wt_cursor_equals_notsup -- + * Unsupported cursor equality. 
+ */ +int +__wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp) +{ + WT_UNUSED(other); + WT_UNUSED(equalp); + + return (__wt_cursor_notsup(cursor)); +} + +/* + * __wt_cursor_search_near_notsup -- + * Unsupported cursor search-near. + */ +int +__wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact) +{ + WT_UNUSED(exact); + + return (__wt_cursor_notsup(cursor)); +} + +/* + * __wt_cursor_reconfigure_notsup -- + * Unsupported cursor reconfiguration. + */ +int +__wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config) +{ + WT_UNUSED(config); + + return (__wt_cursor_notsup(cursor)); } /* @@ -46,13 +127,12 @@ __wt_cursor_set_notsup(WT_CURSOR *cursor) * cursors in a session. Reconfigure is left open in case it's possible * in the future to change these configurations. */ - cursor->compare = - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->compare = __wt_cursor_compare_notsup; cursor->next = __wt_cursor_notsup; cursor->prev = __wt_cursor_notsup; cursor->reset = __wt_cursor_noop; cursor->search = __wt_cursor_notsup; - cursor->search_near = (int (*)(WT_CURSOR *, int *))__wt_cursor_notsup; + cursor->search_near = __wt_cursor_search_near_notsup; cursor->insert = __wt_cursor_notsup; cursor->update = __wt_cursor_notsup; cursor->remove = __wt_cursor_notsup; @@ -628,7 +708,7 @@ __wt_cursor_init(WT_CURSOR *cursor, } else { WT_RET( __wt_config_gets_def(session, cfg, "readonly", 0, &cval)); - if (cval.val != 0) { + if (cval.val != 0 || F_ISSET(S2C(session), WT_CONN_READONLY)) { cursor->insert = __wt_cursor_notsup; cursor->update = __wt_cursor_notsup; cursor->remove = __wt_cursor_notsup; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index d986577f640..9eb88ec6fcd 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -79,22 +79,22 @@ __wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx, WT_CURSOR *cur, WT_CURSOR_TABLE *ctable, int (*f)(WT_CURSOR *)) { WT_CURSOR_STATIC_INIT(iface, - 
__wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __wt_cursor_notsup, /* compare */ - __wt_cursor_notsup, /* equals */ - __wt_cursor_notsup, /* next */ - __wt_cursor_notsup, /* prev */ - __wt_cursor_notsup, /* reset */ - __wt_cursor_notsup, /* search */ - __wt_cursor_notsup, /* search-near */ - __curextract_insert, /* insert */ - __wt_cursor_notsup, /* update */ - __wt_cursor_notsup, /* reconfigure */ - __wt_cursor_notsup, /* remove */ - __wt_cursor_notsup); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __wt_cursor_compare_notsup, /* compare */ + __wt_cursor_equals_notsup, /* equals */ + __wt_cursor_notsup, /* next */ + __wt_cursor_notsup, /* prev */ + __wt_cursor_notsup, /* reset */ + __wt_cursor_notsup, /* search */ + __wt_cursor_search_near_notsup, /* search-near */ + __curextract_insert, /* insert */ + __wt_cursor_notsup, /* update */ + __wt_cursor_notsup, /* remove */ + __wt_cursor_reconfigure_notsup, /* reconfigure */ + __wt_cursor_notsup); /* close */ WT_CURSOR_EXTRACTOR extract_cursor; WT_DECL_RET; WT_ITEM key, value; @@ -842,22 +842,22 @@ __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, - __wt_curtable_get_key, /* get-key */ - __wt_curtable_get_value, /* get-value */ - __wt_curtable_set_key, /* set-key */ - __wt_curtable_set_value, /* set-value */ - __curtable_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __curtable_next, /* next */ - __curtable_prev, /* prev */ - __curtable_reset, /* reset */ - __curtable_search, /* search */ - __curtable_search_near, /* search-near */ - __curtable_insert, /* insert */ - __curtable_update, /* update */ - __curtable_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - 
__curtable_close); /* close */ + __wt_curtable_get_key, /* get-key */ + __wt_curtable_get_value, /* get-value */ + __wt_curtable_set_key, /* set-key */ + __wt_curtable_set_value, /* set-value */ + __curtable_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __curtable_next, /* next */ + __curtable_prev, /* prev */ + __curtable_reset, /* reset */ + __curtable_search, /* search */ + __curtable_search_near, /* search-near */ + __curtable_insert, /* insert */ + __curtable_update, /* update */ + __curtable_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __curtable_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_CURSOR_TABLE *ctable; diff --git a/src/docs/checkpoint.dox b/src/docs/checkpoint.dox index 523c0887859..ec28fea13c3 100644 --- a/src/docs/checkpoint.dox +++ b/src/docs/checkpoint.dox @@ -23,11 +23,16 @@ All transactional updates committed before a checkpoint are made durable by the checkpoint, therefore the frequency of checkpoints limits the volume of data that may be lost due to application or system failure. -When WiredTiger data sources are first opened, they are opened in the -state of the most recent checkpoint taken on the file, in other words, -updates after the most recent checkpoint will not appear in the data -source. If no checkpoint is found when the data source is opened, the -data source will appear empty. +Data sources that are involved in an exclusive operation when the +checkpoint starts, including bulk load, verify or salvage, will be skipped +by the checkpoint. Operations requiring exclusive access may fail with +an \c EBUSY error if attempted during a checkpoint. + +When data sources are first opened, they are opened in the state of the +most recent checkpoint taken on the file, in other words, updates after the +most recent checkpoint will not appear in the data source. If no +checkpoint is found when the data source is opened, the data source will +appear empty. 
@section checkpoint_server Automatic checkpoints @@ -54,15 +59,16 @@ checkpoint cursor is closed. @section checkpoint_naming Checkpoint naming -Additionally, checkpoints that do not include LSM trees may optionally -be given names by the application. Checkpoints named by the application -persist until explicitly discarded or the application creates a new -checkpoint with the same name (which replaces the previous checkpoint -of that name). If the previous checkpoint cannot be replaced, either -because a cursor is reading from the previous checkpoint, or backups are -in progress, the checkpoint will fail. Because named checkpoints -persist until discarded or replaced, they can be used to periodically -snapshot data for later use. +Additionally, checkpoints that do not include LSM trees may optionally be +given names by the application. Because named checkpoints persist until +discarded or replaced, they can be used to periodically snapshot data for +later use. + +Checkpoints named by the application persist until explicitly discarded or +the application creates a new checkpoint with the same name (which replaces +the previous checkpoint of that name). If the previous checkpoint cannot be +replaced, either because a cursor is reading from the previous checkpoint, +or backups are in progress, the checkpoint will fail. Internal checkpoints (that is, checkpoints not named by the application) use the reserved name "WiredTigerCheckpoint". Applications can open the diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index e2b376d5e3f..0f5c56d25ce 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -41,7 +41,7 @@ by default and commands that only read data will not run recovery. Perform a backup of a database or set of data sources. 
The \c backup command performs a backup of the database, copying the -database files to a \c specified directory, which can be subsequently +underlying files to a \c specified directory, which can be subsequently opened as a WiredTiger database. See @ref backup for more information, and @ref file_permissions for specifics on the copied file permissions. @@ -58,10 +58,10 @@ the named data sources. <hr> @section util_compact wt compact -Compact a table or file. +Compact a table. -The \c compact command attempts to rewrite the specified table or file -to consume less disk space. +The \c compact command attempts to rewrite the specified table to +consume less disk space. @subsection util_compact_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] compact uri</code> @@ -71,7 +71,7 @@ The \c compact command has no command-specific options. <hr> @section util_create wt create -Create a table or file. +Create a table. The \c create command creates the specified \c uri with the specified configuration. It is equivalent to a call to WT_SESSION::create with @@ -88,7 +88,7 @@ Include a configuration string to be passed to WT_SESSION::create. <hr> @section util_drop wt drop -Drop a table or file. +Drop a table. The \c drop command drops the specified \c uri. It is equivalent to a call to WT_SESSION::drop with the "force" configuration argument. @@ -136,10 +136,10 @@ printable characters unencoded). <hr> @section util_list wt list -List the tables and files in the database. +List the tables in the database. -By default, the \c list command prints out the tables and files stored in -the database. If a URI is specified as an argument, only information about +By default, the \c list command prints out the tables stored in the +database. If a URI is specified as an argument, only information about that data source is printed. @subsection util_list_synopsis Synopsis @@ -158,16 +158,16 @@ value is printed. 
<hr> @section util_load wt load -Load a table or file from dump output. +Load a table from dump output. The \c load command reads the standard input for data and loads it into -a table or file, creating the table or file if it does not yet exist. -The data should be the format produced by the \c dump command; see -@ref dump_formats for details. +a table, creating the table if it does not yet exist. The data should +be the format produced by the \c dump command; see @ref dump_formats for +details. -By default, if the table or file already exists, data in the file or -table will be overwritten by the new data (use the \c -n option to -make an attempt to overwrite existing data return an error). +By default, if the table already exists, data in the table will be +overwritten by the new data (use the \c -n option to make an attempt to +overwrite existing data return an error). @subsection util_load_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] load [-ajn] [-f input] [-r name] [uri configuration ...]</code> @@ -182,8 +182,8 @@ number keys. The \c -a option is only applicable when loading into a column store. @par <code>-f</code> -By default, the \c load command reads from the standard input; the \c --f option reads the input from the specified file. +By default, the \c load command reads from the standard input; the \c -f +option reads the input from the specified file. @par <code>-j</code> Load input in the JSON (<a href="http://www.json.org">JavaScript Object @@ -196,7 +196,7 @@ load command to fail if there's an attempt to overwrite already existing data. @par <code>-r</code> -By default, the \c load command uses the table or file name taken from the +By default, the \c load command uses the table name taken from the input; the \c -r option renames the data source. 
Additionally, \c uri and \c configuration pairs may be specified to the @@ -227,24 +227,23 @@ table:xxx block_allocation=first table:xxx prefix_compress=false <hr> @section util_loadtext wt loadtext -Load text into a table or file. +Load text into a table. The \c loadtext command reads the standard input for text and loads it -into a table or file. The input data should be printable characters, -with newline delimiters for each key or value. +into a table. The input data should be printable characters, with +newline delimiters for each key or value. -The \c loadtext command does not create the file if it does not yet +The \c loadtext command does not create the object if it does not yet exist. -In the case of inserting values into a column-store table or file, each -value is appended to the table or file; in the case of inserting values -into a row-store table or file, lines are handled in pairs, where the -first line is the key and the second line is the value. If the -row-store table or file already exists, data in the table or file will -be overwritten by the new data. +In the case of inserting values into a column-store table, each value +is appended to the table; in the case of inserting values into a +row-store table, lines are handled in pairs, where the first line is the +key and the second line is the value. If the row-store table already +exists, data in the table will be overwritten by the new data. @subsection util_loadtext_synopsis Synopsis -<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input]</code> +<code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] loadtext [-f input] uri</code> @subsection util_loadtext_options Options The following are command-specific options for the \c loadtext command: @@ -275,7 +274,7 @@ to the default string format. <hr> @section util_read wt read -Read records from a table or file. +Read records from a table. 
The \c read command prints out the records associated with the specified keys from the specified data source. The data source must be configured @@ -291,9 +290,9 @@ The \c read command has no command-specific options. <hr> @section util_rename wt rename -Rename a table or file. +Rename a table. -The \c rename command renames the specified table or file. +The \c rename command renames the specified table. @subsection util_rename_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] rename uri name</code> @@ -303,11 +302,11 @@ The \c rename command has no command-specific options. <hr> @section util_salvage wt salvage -Recover data from a corrupted file. +Recover data from a corrupted table. The \c salvage command salvages the specified data source, discarding any -data that cannot be recovered. Underlying files are re-written in -place, overwriting the original file contents. +data that cannot be recovered. Underlying files are re-written in place, +overwriting the original file contents. @subsection util_salvage_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] salvage [-F force] uri</code> @@ -316,9 +315,9 @@ place, overwriting the original file contents. The following are command-specific options for the \c salvage command: @par <code>-F</code> -By default, salvage will refuse to salvage files that fail basic tests -(for example, files that don't appear to be in a WiredTiger format). -The \c -F option forces the salvage of the file, regardless. +By default, salvage will refuse to salvage tables that fail basic tests +(for example, tables that don't appear to be in a WiredTiger format). +The \c -F option forces the salvage of the table, regardless. <hr> @section util_stat wt stat @@ -339,11 +338,11 @@ Include only "fast" statistics in the output (equivalent to passing <hr> @section util_upgrade wt upgrade -Upgrade a table or file. +Upgrade a table. 
-The \c upgrade command upgrades the specified table or file, exiting -success if the data source is up-to-date, and failure if the data source -cannot be upgraded. +The \c upgrade command upgrades the specified table, exiting success if +the data source is up-to-date, and failure if the data source cannot be +upgraded. @subsection util_upgrade_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] upgrade uri</code> @@ -353,11 +352,10 @@ The \c upgrade command has no command-specific options. <hr> @section util_verify wt verify -Check the structural integrity of a table or file. +Check the structural integrity of a table. -The \c verify command verifies the specified table or file, exiting -success if the data source is correct, and failure if the data source is -corrupted. +The \c verify command verifies the specified table, exiting success if +the data source is correct, and failure if the data source is corrupted. @subsection util_verify_synopsis Synopsis <code>wt [-RVv] [-C config] [-E secretkey ] [-h directory] verify uri</code> @@ -367,7 +365,7 @@ The \c verify command has no command-specific options. <hr> @section util_write wt write -Write records to a table or file. +Write records to a table. The \c write command stores records into the specified data source. 
The data source must be configured with string or record number keys and diff --git a/src/docs/data-sources.dox b/src/docs/data-sources.dox index d09d1cbc1b8..7f1879e0ffe 100644 --- a/src/docs/data-sources.dox +++ b/src/docs/data-sources.dox @@ -38,7 +38,7 @@ cursor types that give access to data managed by WiredTiger: key=<code>string</code>\, value=<code>string</code>\,<br> see @ref metadata for details} @row{<tt>statistics:[\<data source URI\>]</tt>, - database or data source statistics cursor, + database, data source or join statistics cursor, key=<code>int id</code>\,<br> value=<code>(string description\, string value\, uint64_t value)</code>\,<br> @@ -106,7 +106,9 @@ WiredTiger database as well as statistics for individual data sources. The statistics are at two levels: per-database and per-individual data source. Database-wide statistics are retrieved with the \c "statistics:" URI; individual data source statistics are available by specifying -\c "statistics:<data source URI>". +\c "statistics:<data source URI>". Additionally, statistics about a +join cursor can be retrieved by specifying \c "statistics:join" and +supplying the join cursor as an argument in the WT_SESSION::open_cursor call. The statistic key is an integer from the list of keys in @ref_single statistics_keys "Statistics Keys". Statistics cursors return @@ -127,7 +129,11 @@ The following is an example of printing statistics about a table: @snippet ex_stat.c statistics table function -Both examples can use a common display routine that iterates through the +The following is an example of printing statistics about a join cursor: + +@snippet ex_stat.c statistics join cursor function + +These three examples can use a common display routine that iterates through the statistics until the cursor returns the end of the list. 
@snippet ex_stat.c statistics display function diff --git a/src/docs/error-handling.dox b/src/docs/error-handling.dox index d1291e38ff0..d91a126ee21 100644 --- a/src/docs/error-handling.dox +++ b/src/docs/error-handling.dox @@ -55,14 +55,32 @@ This error is generated when wiredtiger_open is configured to return an error if @if IGNORE_BUILT_BY_API_ERR_END @endif -The ::wiredtiger_strerror function returns the standard message -associated with any WiredTiger, ISO C99, or POSIX 1003.1-2001 function: +@section error_translation Translating errors + +The WT_SESSION::strerror and ::wiredtiger_strerror functions return the +standard text message associated with any WiredTiger, ISO C, or POSIX +standard API. + +@snippet ex_all.c Display an error thread safe @snippet ex_all.c Display an error +Note that ::wiredtiger_strerror is not thread-safe. + @m_if{c} +@section error_handling_event Error handling using the WT_EVENT_HANDLER + More complex error handling can be configured by passing an implementation of WT_EVENT_HANDLER to ::wiredtiger_open or WT_CONNECTION::open_session. + +For example, both informational and error messages might be passed to an +application-specific logging function that added a timestamp and logged +the message to a file, and error messages might additionally be output to +the \c stderr file stream. 
+ +@snippet ex_event_handler.c Function event_handler +@snippet ex_event_handler.c Configure event_handler + @m_endif */ diff --git a/src/docs/license.dox b/src/docs/license.dox index febced2c6af..d7814d04fd6 100644 --- a/src/docs/license.dox +++ b/src/docs/license.dox @@ -2,16 +2,16 @@ The complete WiredTiger software package is Open Source software: you are welcome to modify and redistribute it under the terms of -<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html"> -<b>version 2</b></a> or -<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html"> -<b>version 3</b></a> of the -<b>GNU General Public License</b></a> +<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">version 2</a> +or +<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">version 3</a> +of the +<b>GNU General Public License</b> as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -<b>GNU General Public License</b></a> for details. +<b>GNU General Public License</b> for details. Additionally, portions of the WiredTiger distribution are distributed under the terms of the @@ -31,10 +31,10 @@ those described above, or for technical support for this software, please contact MongoDB, Inc. at <a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. -@section license_library 3rd party software included in the WiredTiger library +@section license_library 3rd party software always included in the WiredTiger library Every build of the WiredTiger library binary includes the following 3rd -party software, distributed under their license terms. Redistribution +party software, distributed under separate license terms. Redistribution of the WiredTiger library should comply with these copyrights. <table> @@ -46,14 +46,26 @@ of the WiredTiger library should comply with these copyrights. 
@row{\c src/support/hash_fnv.c, Authors, Public Domain} </table> +@section license_crc32-vpmsum 3rd party software optionally included in the WiredTiger library: PPC64 + +PPC64 and PPC64LE builds of the WiredTiger library binary include additional +3rd party software, distributed under separate license terms. Redistribution +of the WiredTiger library PPC64 and PPC64LE builds should comply with these +copyrights. + +<table> +@hrow{Distribution Files, Copyright Holder, License} +@row{\c src/support/power8/*, Anton Blanchard, <a href="http://opensource.org/licenses/Apache-2.0">Apache License\, Version 2.0</a> or the <a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">GNU General Public License\, version 2 or later</a>} +</table> + @section license_leveldb 3rd party software optionally included in the WiredTiger library: LevelDB If the \c --enable-leveldb configuration option is specified when configuring the WiredTiger build, additional 3rd party software is -included in the WiredTiger LevelDB library binary, distributed under -their license terms. Redistribution of the WiredTiger library built -with the \c --enable-leveldb configuration option should comply with -these copyrights. +included in the WiredTiger library binary, distributed under separate +license terms. Redistribution of the WiredTiger library built with the +\c --enable-leveldb configuration option should comply with these +copyrights. <table> @hrow{Distribution Files, Copyright Holder, License} diff --git a/src/docs/programming.dox b/src/docs/programming.dox index 5d79edd660b..f717f4ed1fe 100644 --- a/src/docs/programming.dox +++ b/src/docs/programming.dox @@ -30,6 +30,7 @@ each of which is ordered by one or more columns. 
<h2>Programming notes</h2> - @subpage threads - @subpage namespace +- @subpage readonly @m_if{c} - @subpage signals @m_endif diff --git a/src/docs/readonly.dox b/src/docs/readonly.dox new file mode 100644 index 00000000000..ad4a94a73f1 --- /dev/null +++ b/src/docs/readonly.dox @@ -0,0 +1,55 @@ +/*! @m_page{{c,java},readonly,Database read-only mode} + +WiredTiger supports read-only mode databases. When a database is opened +in read-only mode, all modifications are disabled on the WT_CONNECTION +handle, any sessions opened in that connection and any cursors opened +in any of those sessions. For example, all cursor or session handle +methods that modify the database will instead return errors. + +When a database is opened in read-only mode, the database directory and +content must already exist and have been shut down cleanly. + +@section readonly_config Database read-only configuration considerations + +The \c readonly configuration affects other configuration settings. +Where a default setting contradicts read-only operation, WiredTiger +defaults are overridden to perform in a read-only mode. For example, LSM +tree merges are turned off when LSM trees are configured, and log file +archiving is disabled when logging is configured. + +Where a user-configured setting contradicts read-only operation, WiredTiger +will return an error. For example, zero-filling +log files is not allowed in read-only mode, and attempting to configure +it will return an error. + +@section readonly_recovery Readonly configuration and recovery + +Because recovery modifies the database, recovery cannot be done in +read-only mode. A ::wiredtiger_open call to open a database in read-only +mode will fail if the database was not cleanly shut down and recovery is +required. 
+ +@section readonly_logging Readonly configuration and logging + +If logging is enabled on the database when opened in read-only mode, log +file archiving and log file pre-allocation are disabled and the log files +will not be modified in any way. + +@section readonly_lsm Readonly configuration and LSM trees + +If LSM trees are in use, read-only mode turns off all modification. +Internal LSM operations such as merging, creating new chunks, creating +bloom filters and dropping old chunks are disabled. + +@section readonly_handles Readonly configuration and multiple database handles + +One unusual effect of read-only operations is the potential for multiple +read-only database handles open on the same database at the same time. +WiredTiger prevents multiple connection handles by writing a lock file, +and this locking is done even in read-only mode. However, if the lock +file cannot be written, opening in read-only mode is still allowed to +proceed. For that reason, multiple read-only connection handles could +be open at the same time. Normal locking occurs if the lock file can be +written in read-only mode, preventing multiple database connections. 
+ +*/ diff --git a/src/docs/spell.ok b/src/docs/spell.ok index 80597302cbb..efc306568cd 100644 --- a/src/docs/spell.ok +++ b/src/docs/spell.ok @@ -7,6 +7,7 @@ Atomicity BLOBs CFLAGS CPPFLAGS +CRC Cheng Christoph Collet's @@ -64,6 +65,7 @@ NOTFOUND NUMA NoSQL OPTYPE +PPC PRELOAD README Rebalance @@ -151,6 +153,7 @@ control's copydoc cpp crashless +crc cursortype customerABC cv @@ -377,6 +380,7 @@ rVv rdbms rdlock readlock +readonly realclean realloc realloc'd @@ -419,6 +423,7 @@ src ssd startsync statlog +stderr str strerror strftime @@ -475,6 +480,7 @@ valuefmt vec versa vm +vpmsum warmup whitespace wiredtiger diff --git a/src/docs/statistics.dox b/src/docs/statistics.dox index 453da34c51a..0a29e351e4e 100644 --- a/src/docs/statistics.dox +++ b/src/docs/statistics.dox @@ -79,6 +79,15 @@ or logged: @snippet ex_all.c Statistics clear configuration +The following example opens a statistics cursor on an open join cursor: + +@snippet ex_schema.c Statistics cursor join cursor + +The statistics gathered will be organized by reference cursors participating +in the join (see WT_SESSION::join); the uri of each reference cursor appears +as a prefix in the description field returned as a value by the statistics +cursor. + @section statistics_log Statistics logging WiredTiger will optionally log database statistics into a file when the diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox index e4d85003a1e..8b3d61e4c19 100644 --- a/src/docs/upgrading.dox +++ b/src/docs/upgrading.dox @@ -2,27 +2,34 @@ @section version_271 Upgrading to Version 2.7.1 <dl> +<dt>LSM metadata</dt> +<dd> +There is a change to the format of LSM metadata in this release to fix bugs +in dump / load of tables of type LSM. Tables created with the old LSM metadata +format will be upgraded automatically, but once updated to the new version +<b>are no longer compatible with older releases of WiredTiger</b>. 
+</dd> + <dt>Column-store bulk-load cursors</dt> <dd> -Historically, bulk-load of a column-store object ignored any key set in -the cursor and automatically assigned each inserted row the next -sequential record number for its key. In the 2.7.1 release, column-store -objects match row-store behavior and require the cursor key be set -before an insert. (This also allows allows sparse tables to be created -in column-store objects, any skipped records are created as -already-deleted rows.) To match the previous behavior, specify the -\c append configuration string when opening the column-store bulk-load -cursor; this causes the cursor's key to be ignored and each inserted row -will be assigned the next record number. +Historically, bulk-load of a column-store object ignored any key set in the +cursor and automatically assigned each inserted row the next sequential +record number for its key. In the 2.7.1 release, column-store objects match +row-store behavior and require the cursor key be set before an insert. +(This allows sparse tables to be created in column-store objects, any +skipped records are created as already-deleted rows.) To match the previous +behavior, specify the \c append configuration string when opening the +column-store bulk-load cursor; this causes the cursor's key to be ignored +and each inserted row will be assigned the next record number. </dd> <dt>Change to WT_SESSION::truncate with URI</dt> <dd> If using the WT_SESSION::truncate API with a file: URI for a full table -truncate, underlying algorithmic changes result in some visible differences. -This call can now return WT_ROLLBACK. Applications should be prepared to -handle this error. This method no longer requires exclusive access to the -table. Also the underlying disk space may not be immediately +truncate, underlying algorithmic changes result in some visible +differences. This call can now return WT_ROLLBACK. Applications should be +prepared to handle this error. 
This method no longer requires exclusive +access to the table. Also the underlying disk space may not be immediately reclaimed when the call returns. The performance of this API may differ from earlier releases. </dd> @@ -34,6 +41,14 @@ from the WiredTiger release; remaining compression engines include LZ4, snappy and zlib. </dd> +<dt>Change to named checkpoints with bulk loads</dt> +<dd> +Previous versions of WiredTiger created empty named checkpoints in files +being bulk-loaded. In this release, checkpoint skips files being +bulk-loaded, so they do not get named checkpoints that complete during the +bulk load. +</dd> + </dl><hr> @section version_270 Upgrading to Version 2.7.0 diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index 1f0d1533ac4..6d8dcab8f65 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -212,6 +212,10 @@ insert operations generate random content for the value @par read_range (unsigned int, default=0) scan a range of keys after each search +@par readonly (boolean, default=false) +reopen the connection between populate and workload phases in readonly +mode. Requires reopen_connection turned on (default). Requires that +read be the only workload specified @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) @@ -247,14 +251,19 @@ threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed -configuration values are 'count', 'throttle', 'reads', 'inserts', -'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are -also behavior modifiers, supported modifiers are 'ops_per_txn' +configuration values are 'count', 'throttle', 'update_delta', 'reads', +'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. 
+There are also behavior modifiers, supported modifiers are +'ops_per_txn' @par transaction_config (string, default=) transaction configuration string, relevant when populate_opts_per_txn is nonzero @par table_name (string, default=test) table name +@par value_sz_max (unsigned int, default=1000) +maximum value size when delta updates are present. Default disabled +@par value_sz_min (unsigned int, default=1) +minimum value size when delta updates are present. Default disabled @par value_sz (unsigned int, default=100) value size @par verbose (unsigned int, default=1) diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index 641864a8baa..ca98b1bd62a 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -18,13 +18,12 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; - bool evict_reset; /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ - WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); + WT_RET(__wt_evict_file_exclusive_on(session)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session, true); @@ -98,8 +97,7 @@ err: /* On error, clear any left-over tree walk. 
*/ session, next_ref, WT_READ_NO_EVICT)); } - if (evict_reset) - __wt_evict_file_exclusive_off(session); + __wt_evict_file_exclusive_off(session); return (ret); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 0536a06bc22..50a00787f35 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -159,7 +159,7 @@ __wt_evict_server_wake(WT_SESSION_IMPL *session) bytes_max / WT_MEGABYTE)); } - return (__wt_cond_signal(session, cache->evict_cond)); + return (__wt_cond_auto_signal(session, cache->evict_cond)); } /* @@ -175,8 +175,8 @@ __evict_server(void *arg) WT_SESSION_IMPL *session; #ifdef HAVE_DIAGNOSTIC struct timespec now, stuck_ts; - uint64_t pages_evicted = 0; #endif + uint64_t pages_evicted = 0; u_int spins; session = arg; @@ -219,11 +219,11 @@ __evict_server(void *arg) /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; -#ifdef HAVE_DIAGNOSTIC pages_evicted = 0; } else if (pages_evicted != cache->pages_evict) { - WT_ERR(__wt_epoch(session, &stuck_ts)); pages_evicted = cache->pages_evict; +#ifdef HAVE_DIAGNOSTIC + WT_ERR(__wt_epoch(session, &stuck_ts)); } else { /* After being stuck for 5 minutes, give up. */ WT_ERR(__wt_epoch(session, &now)); @@ -238,7 +238,8 @@ __evict_server(void *arg) WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "sleeping")); /* Don't rely on signals: check periodically. */ - WT_ERR(__wt_cond_wait(session, cache->evict_cond, 100000)); + WT_ERR(__wt_cond_auto_wait( + session, cache->evict_cond, pages_evicted != 0)); WT_ERR(__wt_verbose(session, WT_VERB_EVICTSERVER, "waking")); } @@ -477,6 +478,7 @@ __evict_update_work(WT_SESSION_IMPL *session) conn = S2C(session); cache = conn->cache; + WT_STAT_FAST_CONN_SET(session, cache_eviction_aggressive_set, 0); /* Clear previous state. 
*/ cache->state = 0; @@ -534,8 +536,11 @@ __evict_update_work(WT_SESSION_IMPL *session) return (false); -done: if (F_ISSET(cache, WT_CACHE_STUCK)) +done: if (F_ISSET(cache, WT_CACHE_STUCK)) { + WT_STAT_FAST_CONN_SET(session, + cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + } return (true); } @@ -594,8 +599,11 @@ __evict_pass(WT_SESSION_IMPL *session) if (!__evict_update_work(session)) break; - if (loop > 10) + if (loop > 10) { + WT_STAT_FAST_CONN_SET(session, + cache_eviction_aggressive_set, 1); FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + } /* * Start a worker if we have capacity and we haven't reached @@ -713,12 +721,32 @@ __evict_clear_walks(WT_SESSION_IMPL *session) } /* - * __evict_request_walk_clear -- + * __evict_clear_all_walks -- + * Clear the eviction walk points for all files a session is waiting on. + */ +static int +__evict_clear_all_walks(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + + conn = S2C(session); + + TAILQ_FOREACH(dhandle, &conn->dhqh, q) + if (WT_PREFIX_MATCH(dhandle->name, "file:")) + WT_WITH_DHANDLE(session, + dhandle, WT_TRET(__evict_clear_walk(session))); + return (ret); +} + +/* + * __evict_request_clear_walk -- * Request that the eviction server clear the tree's current eviction * point. */ static int -__evict_request_walk_clear(WT_SESSION_IMPL *session) +__evict_request_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -746,32 +774,12 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) } /* - * __evict_clear_all_walks -- - * Clear the eviction walk points for all files a session is waiting on. 
- */ -static int -__evict_clear_all_walks(WT_SESSION_IMPL *session) -{ - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - - conn = S2C(session); - - TAILQ_FOREACH(dhandle, &conn->dhqh, q) - if (WT_PREFIX_MATCH(dhandle->name, "file:")) - WT_WITH_DHANDLE(session, - dhandle, WT_TRET(__evict_clear_walk(session))); - return (ret); -} - -/* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. */ int -__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) +__wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -779,33 +787,39 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) WT_EVICT_ENTRY *evict; u_int i, elem; - *evict_resetp = false; - btree = S2BT(session); cache = S2C(session)->cache; - /* If the file wasn't evictable, there's no work to do. */ - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) + /* + * Hold the walk lock to set the no-eviction flag. + * + * The no-eviction flag can be set permanently, in which case we never + * increment the no-eviction count. + */ + __wt_spin_lock(session, &cache->evict_walk_lock); + if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + if (btree->evict_disabled != 0) + ++btree->evict_disabled; + __wt_spin_unlock(session, &cache->evict_walk_lock); return (0); + } + ++btree->evict_disabled; /* - * Hold the walk lock to set the "no eviction" flag: no new pages from - * the file will be queued for eviction after this point. + * Ensure no new pages from the file will be queued for eviction after + * this point. */ - __wt_spin_lock(session, &cache->evict_walk_lock); F_SET(btree, WT_BTREE_NO_EVICTION); - __wt_spin_unlock(session, &cache->evict_walk_lock); + WT_FULL_BARRIER(); /* Clear any existing LRU eviction walk for the file. */ - WT_ERR(__evict_request_walk_clear(session)); - - /* Hold the evict lock to remove any queued pages from this file. 
*/ - __wt_spin_lock(session, &cache->evict_lock); + WT_ERR(__evict_request_clear_walk(session)); /* * The eviction candidate list might reference pages from the file, - * clear it. + * clear it. Hold the evict lock to remove queued pages from a file. */ + __wt_spin_lock(session, &cache->evict_lock); elem = cache->evict_max; for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++) if (evict->btree == btree) @@ -819,10 +833,11 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp) while (btree->evict_busy > 0) __wt_yield(); - *evict_resetp = true; - return (0); - -err: F_CLR(btree, WT_BTREE_NO_EVICTION); + if (0) { +err: --btree->evict_disabled; + F_CLR(btree, WT_BTREE_NO_EVICTION); + } + __wt_spin_unlock(session, &cache->evict_walk_lock); return (ret); } @@ -834,12 +849,28 @@ void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session) { WT_BTREE *btree; + WT_CACHE *cache; btree = S2BT(session); + cache = S2C(session)->cache; - WT_ASSERT(session, btree->evict_ref == NULL); + /* + * We have seen subtle bugs with multiple threads racing to turn + * eviction on/off. Make races more likely in diagnostic builds. + */ + WT_DIAGNOSTIC_YIELD; - F_CLR(btree, WT_BTREE_NO_EVICTION); + WT_ASSERT(session, + btree->evict_ref == NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)); + + /* + * The no-eviction flag can be set permanently, in which case we never + * increment the no-eviction count. 
+ */ + __wt_spin_lock(session, &cache->evict_walk_lock); + if (btree->evict_disabled > 0 && --btree->evict_disabled == 0) + F_CLR(btree, WT_BTREE_NO_EVICTION); + __wt_spin_unlock(session, &cache->evict_walk_lock); } /* @@ -869,7 +900,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; - uint64_t cutoff; + uint64_t cutoff, read_gen_oldest; uint32_t candidates, entries; cache = S2C(session)->cache; @@ -910,34 +941,62 @@ __evict_lru_walk(WT_SESSION_IMPL *session) return (0); } - WT_ASSERT(session, cache->evict_queue[0].ref != NULL); - - /* Track the oldest read generation we have in the queue. */ - cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen; - + /* Decide how many of the candidates we're going to try and evict. */ if (FLD_ISSET(cache->state, - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { /* * Take all candidates if we only gathered pages with an oldest * read generation set. */ cache->evict_candidates = entries; - else { - /* Find the bottom 25% of read generations. */ - cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) + - __evict_read_gen(&cache->evict_queue[entries - 1])) / 4; + } else { /* - * Don't take less than 10% or more than 50% of entries, - * regardless. That said, if there is only one entry, which is - * normal when populating an empty file, don't exclude it. + * Find the oldest read generation we have in the queue, used + * to set the initial value for pages read into the system. + * The queue is sorted, find the first "normal" generation. 
*/ - for (candidates = 1 + entries / 10; - candidates < entries / 2; - candidates++) - if (__evict_read_gen( - &cache->evict_queue[candidates]) > cutoff) + read_gen_oldest = WT_READGEN_OLDEST; + for (candidates = 0; candidates < entries; ++candidates) { + read_gen_oldest = + __evict_read_gen(&cache->evict_queue[candidates]); + if (read_gen_oldest != WT_READGEN_OLDEST) break; - cache->evict_candidates = candidates; + } + + /* + * Take all candidates if we only gathered pages with an oldest + * read generation set. + * + * We normally never take more than 50% of the entries; if 50% + * of the entries were at the oldest read generation, take them. + */ + if (read_gen_oldest == WT_READGEN_OLDEST) + cache->evict_candidates = entries; + else if (candidates >= entries / 2) + cache->evict_candidates = candidates; + else { + /* Save the calculated oldest generation. */ + cache->read_gen_oldest = read_gen_oldest; + + /* Find the bottom 25% of read generations. */ + cutoff = + (3 * read_gen_oldest + __evict_read_gen( + &cache->evict_queue[entries - 1])) / 4; + + /* + * Don't take less than 10% or more than 50% of entries, + * regardless. That said, if there is only one entry, + * which is normal when populating an empty file, don't + * exclude it. + */ + for (candidates = 1 + entries / 10; + candidates < entries / 2; + candidates++) + if (__evict_read_gen( + &cache->evict_queue[candidates]) > cutoff) + break; + cache->evict_candidates = candidates; + } } cache->evict_current = cache->evict_queue; @@ -1106,23 +1165,27 @@ retry: while (slot < max_entries && ret == 0) { __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = false; - __wt_spin_lock(session, &cache->evict_walk_lock); - /* - * Re-check the "no eviction" flag -- it is used to enforce - * exclusive access when a handle is being closed. + * Re-check the "no eviction" flag, used to enforce exclusive + * access when a handle is being closed. If not set, remember + * the file to visit first, next loop. 
+ * + * Only try to acquire the lock and simply continue if we fail; + * the lock is held while the thread turning off eviction clears + * the tree's current eviction point, and part of the process is + * waiting on this thread to acknowledge that action. */ - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { - /* Remember the file to visit first, next loop. */ - cache->evict_file_next = dhandle; - - WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_file(session, &slot)); - WT_ASSERT(session, session->split_gen == 0); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION) && + !__wt_spin_trylock(session, &cache->evict_walk_lock)) { + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { + cache->evict_file_next = dhandle; + WT_WITH_DHANDLE(session, dhandle, + ret = __evict_walk_file(session, &slot)); + WT_ASSERT(session, session->split_gen == 0); + } + __wt_spin_unlock(session, &cache->evict_walk_lock); } - __wt_spin_unlock(session, &cache->evict_walk_lock); - /* * If we didn't find any candidates in the file, skip it next * time. @@ -1209,7 +1272,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) uint64_t pages_walked; uint32_t walk_flags; int internal_pages, restarts; - bool enough, modified, would_split; + bool enough, modified; conn = S2C(session); btree = S2BT(session); @@ -1265,9 +1328,22 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) continue; + /* + * It's possible (but unlikely) to visit a page without a read + * generation, if we race with the read instantiating the page. + * Ignore those pages, but set the page's read generation here + * to ensure a bug doesn't somehow leave a page without a read + * generation. + */ + if (page->read_gen == WT_READGEN_NOTSET) { + __wt_cache_read_gen_new(session, page); + continue; + } + /* Pages we no longer need (clean or dirty), are found money. 
*/ if (__wt_page_is_empty(page) || - F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + page->read_gen == WT_READGEN_OLDEST) goto fast; /* Skip clean pages if appropriate. */ @@ -1280,25 +1356,17 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) * eviction, skip anything that isn't marked. */ if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && - page->memory_footprint < btree->splitmempage && - page->read_gen != WT_READGEN_OLDEST) + page->memory_footprint < btree->splitmempage) continue; /* Limit internal pages to 50% unless we get aggressive. */ if (WT_PAGE_IS_INTERNAL(page) && - ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && - !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE) && + internal_pages >= (int)(evict - start) / 2) continue; - /* - * If this page has never been considered for eviction, set its - * read generation to somewhere in the middle of the LRU list. - */ - if (page->read_gen == WT_READGEN_NOTSET) - page->read_gen = __wt_cache_read_gen_new(session); - fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, ref, &would_split)) + if (!__wt_page_can_evict(session, ref, NULL)) continue; /* @@ -1332,6 +1400,9 @@ fast: /* If the page can't be evicted, give up. */ __evict_init_candidate(session, evict, ref); ++evict; + if (WT_PAGE_IS_INTERNAL(page)) + ++internal_pages; + WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" PRIu64, page, page->memory_footprint)); } @@ -1392,8 +1463,9 @@ __evict_get_ref( } /* - * The eviction server only tries to evict half of the pages before - * looking for more. + * Only evict half of the pages before looking for more. The remainder + * are left to eviction workers (if configured), or application threads + * if necessary. 
*/ candidates = cache->evict_candidates; if (is_server && candidates > 1) @@ -1452,7 +1524,6 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *page; WT_REF *ref; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); @@ -1481,9 +1552,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) * the page and some other thread may have evicted it by the time we * look at it. */ - page = ref->page; - if (page->read_gen != WT_READGEN_OLDEST) - page->read_gen = __wt_cache_read_gen_bump(session); + __wt_cache_read_gen_bump(session, ref->page); WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, false)); diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 72c07eaa05d..f0d4752cc83 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -471,8 +471,7 @@ __evict_review( LF_SET(WT_EVICT_IN_MEMORY | WT_EVICT_UPDATE_RESTORE); else if (page->read_gen == WT_READGEN_OLDEST) LF_SET(WT_EVICT_UPDATE_RESTORE); - else if (F_ISSET(session, WT_SESSION_INTERNAL) && - F_ISSET(S2C(session)->cache, WT_CACHE_STUCK)) + else if (F_ISSET(S2C(session)->cache, WT_CACHE_STUCK)) LF_SET(WT_EVICT_LOOKASIDE); } diff --git a/src/include/btmem.h b/src/include/btmem.h index ee495c52fc8..7cdf2bef43a 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -598,9 +598,14 @@ struct __wt_page { * read generation is incremented by the eviction server each time it * becomes active. To avoid incrementing a page's read generation too * frequently, it is set to a future point. + * + * Because low read generation values have special meaning, and there + * are places where we manipulate the value, use an initial value well + * outside of the special range. 
*/ #define WT_READGEN_NOTSET 0 #define WT_READGEN_OLDEST 1 +#define WT_READGEN_START_VALUE 100 #define WT_READGEN_STEP 100 uint64_t read_gen; diff --git a/src/include/btree.h b/src/include/btree.h index 703de0f2fc6..fd921677751 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -129,10 +129,11 @@ struct __wt_btree { uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ uint64_t write_gen; /* Write generation */ - WT_REF *evict_ref; /* Eviction thread's location */ - uint64_t evict_priority; /* Relative priority of cached pages */ - u_int evict_walk_period; /* Skip this many LRU walks */ - u_int evict_walk_skips; /* Number of walks skipped */ + WT_REF *evict_ref; /* Eviction thread's location */ + uint64_t evict_priority; /* Relative priority of cached pages */ + u_int evict_walk_period; /* Skip this many LRU walks */ + u_int evict_walk_skips; /* Number of walks skipped */ + u_int evict_disabled; /* Eviction disabled count */ volatile uint32_t evict_busy; /* Count of threads in eviction */ enum { diff --git a/src/include/btree.i b/src/include/btree.i index b4b4d7f25a2..6df7f87073f 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1149,7 +1149,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) * parent frees the backing blocks for any no-longer-used overflow keys, * which will corrupt the checkpoint's block management. */ - if (btree->checkpointing && + if (btree->checkpointing != WT_CKPT_OFF && F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); diff --git a/src/include/cache.h b/src/include/cache.h index a3961d6043e..9184a2fe6ed 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -75,9 +75,9 @@ struct __wt_cache { /* * Read information. 
*/ - uint64_t read_gen; /* Page read generation (LRU) */ - uint64_t read_gen_oldest; /* The oldest read generation that - eviction knows about */ + uint64_t read_gen; /* Current page read generation */ + uint64_t read_gen_oldest; /* Oldest read generation the eviction + * server saw in its last queue load */ /* * Eviction thread information. diff --git a/src/include/cache.i b/src/include/cache.i index ee13eee84c5..8cf7555e716 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -28,34 +28,43 @@ __wt_cache_read_gen_incr(WT_SESSION_IMPL *session) /* * __wt_cache_read_gen_bump -- - * Get the read generation to keep a page in memory. + * Update the page's read generation. */ -static inline uint64_t -__wt_cache_read_gen_bump(WT_SESSION_IMPL *session) +static inline void +__wt_cache_read_gen_bump(WT_SESSION_IMPL *session, WT_PAGE *page) { + /* Ignore pages set for forcible eviction. */ + if (page->read_gen == WT_READGEN_OLDEST) + return; + + /* Ignore pages already in the future. */ + if (page->read_gen > __wt_cache_read_gen(session)) + return; + /* - * We return read-generations from the future (where "the future" is - * measured by increments of the global read generation). The reason - * is because when acquiring a new hazard pointer for a page, we can - * check its read generation, and if the read generation isn't less - * than the current global generation, we don't bother updating the - * page. In other words, the goal is to avoid some number of updates - * immediately after each update we have to make. + * We set read-generations in the future (where "the future" is measured + * by increments of the global read generation). The reason is because + * when acquiring a new hazard pointer for a page, we can check its read + * generation, and if the read generation isn't less than the current + * global generation, we don't bother updating the page. 
In other + * words, the goal is to avoid some number of updates immediately after + * each update we have to make. */ - return (__wt_cache_read_gen(session) + WT_READGEN_STEP); + page->read_gen = __wt_cache_read_gen(session) + WT_READGEN_STEP; } /* * __wt_cache_read_gen_new -- * Get the read generation for a new page in memory. */ -static inline uint64_t -__wt_cache_read_gen_new(WT_SESSION_IMPL *session) +static inline void +__wt_cache_read_gen_new(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_CACHE *cache; cache = S2C(session)->cache; - return (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2; + page->read_gen = + (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2; } /* @@ -119,12 +128,11 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (0); /* - * LSM sets the no-eviction flag when holding the LSM tree lock, - * in that case, or when holding the schema lock, we don't want to - * highjack the thread for eviction. + * LSM sets the no-eviction flag when holding the LSM tree lock, in that + * case, or when holding the schema lock, we don't want to highjack the + * thread for eviction. */ - if (F_ISSET(session, - WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) + if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); @@ -224,11 +232,11 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) return (0); /* - * Threads operating on trees that cannot be evicted are ignored, - * mostly because they're not contributing to the problem. + * Threads operating on cache-resident trees are ignored because they're + * not contributing to the problem. */ btree = S2BT_SAFE(session); - if (btree != NULL && F_ISSET(btree, WT_BTREE_NO_EVICTION)) + if (btree != NULL && F_ISSET(btree, WT_BTREE_IN_MEMORY)) return (0); /* Check if eviction is needed. 
*/ diff --git a/src/include/column.i b/src/include/column.i index 9f3e2101f6f..d64e68420a5 100644 --- a/src/include/column.i +++ b/src/include/column.i @@ -11,13 +11,13 @@ * Search a column-store insert list for the next larger record. */ static inline WT_INSERT * -__col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. */ - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path check for targets past the end of the skiplist. */ @@ -29,7 +29,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * go as far as possible at each level before stepping down to the next. */ ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno >= WT_INSERT_RECNO(*insp)) { ins = *insp; /* GTE: keep going at this level */ insp = &(*insp)->next[i]; @@ -50,7 +50,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * such a record exists before searching. */ if (ins == NULL) - ins = WT_SKIP_FIRST(inshead); + ins = WT_SKIP_FIRST(ins_head); while (recno >= WT_INSERT_RECNO(ins)) ins = WT_SKIP_NEXT(ins); return (ins); @@ -61,13 +61,13 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for the next smaller record. */ static inline WT_INSERT * -__col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_lt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. */ - if ((ins = WT_SKIP_FIRST(inshead)) == NULL) + if ((ins = WT_SKIP_FIRST(ins_head)) == NULL) return (NULL); /* Fast path check for targets before the skiplist. 
*/ @@ -78,7 +78,7 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno > WT_INSERT_RECNO(*insp)) { ins = *insp; /* GT: keep going at this level */ insp = &(*insp)->next[i]; @@ -95,14 +95,14 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for an exact match. */ static inline WT_INSERT * -__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT **insp, *ret_ins; uint64_t ins_recno; int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path the check for values at the end of the skiplist. */ @@ -115,7 +115,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if (*insp == NULL) { --i; --insp; @@ -143,7 +143,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list, creating a skiplist stack as we go. 
*/ static inline WT_INSERT * -__col_insert_search(WT_INSERT_HEAD *inshead, +__col_insert_search(WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno) { WT_INSERT **insp, *ret_ins; @@ -151,15 +151,15 @@ __col_insert_search(WT_INSERT_HEAD *inshead, int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path appends. */ if (recno >= WT_INSERT_RECNO(ret_ins)) { for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { ins_stack[i] = (i == 0) ? &ret_ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? + &ins_head->tail[i]->next[i] : &ins_head->head[i]; next_stack[i] = NULL; } return (ret_ins); @@ -169,7 +169,7 @@ __col_insert_search(WT_INSERT_HEAD *inshead, * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. 
*/ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if ((ret_ins = *insp) == NULL) { next_stack[i] = NULL; ins_stack[i--] = insp--; diff --git a/src/include/config.h b/src/include/config.h index e63db0e76cf..48a255134af 100644 --- a/src/include/config.h +++ b/src/include/config.h @@ -85,13 +85,15 @@ struct __wt_config_parser_impl { #define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33 #define WT_CONFIG_ENTRY_WT_SESSION_verify 34 #define WT_CONFIG_ENTRY_colgroup_meta 35 -#define WT_CONFIG_ENTRY_file_meta 36 -#define WT_CONFIG_ENTRY_index_meta 37 -#define WT_CONFIG_ENTRY_table_meta 38 -#define WT_CONFIG_ENTRY_wiredtiger_open 39 -#define WT_CONFIG_ENTRY_wiredtiger_open_all 40 -#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 41 -#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 42 +#define WT_CONFIG_ENTRY_file_config 36 +#define WT_CONFIG_ENTRY_file_meta 37 +#define WT_CONFIG_ENTRY_index_meta 38 +#define WT_CONFIG_ENTRY_lsm_meta 39 +#define WT_CONFIG_ENTRY_table_meta 40 +#define WT_CONFIG_ENTRY_wiredtiger_open 41 +#define WT_CONFIG_ENTRY_wiredtiger_open_all 42 +#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 43 +#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 44 /* * configuration section: END * DO NOT EDIT: automatically built by dist/flags.py. 
diff --git a/src/include/connection.h b/src/include/connection.h index 88797e83ad6..2255056fcf6 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -175,6 +175,7 @@ struct __wt_connection_impl { WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ + WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ @@ -298,9 +299,10 @@ struct __wt_connection_impl { #define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ #define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ #define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ -#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */ -#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ -#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ +#define WT_CONN_STAT_JSON 0x08 /* output JSON format */ +#define WT_CONN_STAT_NONE 0x10 /* don't gather statistics */ +#define WT_CONN_STAT_ON_CLOSE 0x20 /* output statistics on close */ +#define WT_CONN_STAT_SIZE 0x40 /* "size" statistics configured */ uint32_t stat_flags; /* Connection statistics */ diff --git a/src/include/cursor.h b/src/include/cursor.h index 7f7b5dceb79..4b35daf106e 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -31,22 +31,22 @@ NULL, /* uri */ \ NULL, /* key_format */ \ NULL, /* value_format */ \ - (int (*)(WT_CURSOR *, ...))(get_key), \ - (int (*)(WT_CURSOR *, ...))(get_value), \ - (void (*)(WT_CURSOR *, ...))(set_key), \ - (void (*)(WT_CURSOR *, ...))(set_value), \ - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(compare), \ - (int (*)(WT_CURSOR *, WT_CURSOR *, int *))(equals), \ + get_key, \ + get_value, \ + set_key, \ + set_value, \ + compare, \ + equals, \ next, \ prev, \ reset, \ search, \ - 
(int (*)(WT_CURSOR *, int *))(search_near), \ + search_near, \ insert, \ update, \ remove, \ close, \ - (int (*)(WT_CURSOR *, const char *))(reconfigure), \ + reconfigure, \ { NULL, NULL }, /* TAILQ_ENTRY q */ \ 0, /* recno key */ \ { 0 }, /* recno raw buffer */ \ @@ -213,10 +213,11 @@ struct __wt_cursor_btree { #define WT_CBT_NO_TXN 0x10 /* Non-transactional cursor (e.g. on a checkpoint) */ #define WT_CBT_SEARCH_SMALLEST 0x20 /* Row-store: small-key insert list */ +#define WT_CBT_VAR_ONPAGE_MATCH 0x40 /* Var-store: on-page recno match */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_SEARCH_SMALLEST) + WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH) uint8_t flags; }; @@ -287,9 +288,12 @@ struct __wt_cursor_join_iter { WT_SESSION_IMPL *session; WT_CURSOR_JOIN *cjoin; WT_CURSOR_JOIN_ENTRY *entry; - WT_CURSOR *cursor; - WT_ITEM *curkey; - bool advance; + WT_CURSOR *cursor; /* has null projection */ + WT_CURSOR *main; /* main table with projection */ + WT_ITEM *curkey; /* primary key */ + WT_ITEM idxkey; + bool positioned; + bool isequal; /* advancing means we're done */ }; struct __wt_cursor_join_endpoint { @@ -302,14 +306,18 @@ struct __wt_cursor_join_endpoint { #define WT_CURJOIN_END_GT 0x04 /* include values > cursor */ #define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ) #define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ) -#define WT_CURJOIN_END_OWN_KEY 0x08 /* must free key's data */ +#define WT_CURJOIN_END_OWN_CURSOR 0x08 /* must close cursor */ uint8_t flags; /* range for this endpoint */ }; +#define WT_CURJOIN_END_RANGE(endp) \ + ((endp)->flags & \ + (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT)) struct __wt_cursor_join_entry { WT_INDEX *index; WT_CURSOR *main; /* raw main table cursor */ WT_BLOOM *bloom; /* Bloom filter handle */ + char *repack_format; /* target format for repack */ uint32_t bloom_bit_count; /* bits 
per item in bloom */ uint32_t bloom_hash_count; /* hash functions in bloom */ uint64_t count; /* approx number of matches */ diff --git a/src/include/extern.h b/src/include/extern.h index 1999ff6b732..48c52d4a109 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -168,7 +168,7 @@ extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); -extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_CACHE_OP op); +extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CACHE_OP op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, bool empty_page_ok); @@ -190,7 +190,7 @@ extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int s extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); -extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); +extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); @@ -252,9 +252,7 @@ extern int 
__wt_checkpoint_signal(WT_SESSION_IMPL *session, wt_off_t logsize); extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint); extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags); -extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, bool apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]); extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool force); extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); @@ -278,7 +276,6 @@ extern int __wt_sweep_create(WT_SESSION_IMPL *session); extern int __wt_sweep_destroy(WT_SESSION_IMPL *session); extern int __wt_curbackup_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_backup_file_remove(WT_SESSION_IMPL *session); -extern int __wt_backup_list_uri_append( WT_SESSION_IMPL *session, const char *name, bool *skip); extern int __wt_curbulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool bitmap, bool skip_sort_check); extern int __wt_curconfig_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_curds_open( 
WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp); @@ -300,12 +297,20 @@ extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const c extern ssize_t __wt_json_strlen(const char *src, size_t srclen); extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen); extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_schema_create_final( WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret); extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); extern int __wt_curstat_init(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst); extern int __wt_curstat_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *other, const char *cfg[], WT_CURSOR **cursorp); -extern int __wt_cursor_notsup(WT_CURSOR *cursor); extern int __wt_cursor_noop(WT_CURSOR *cursor); +extern int __wt_cursor_notsup(WT_CURSOR *cursor); +extern int __wt_cursor_get_value_notsup(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_key_notsup(WT_CURSOR *cursor, ...); +extern void __wt_cursor_set_value_notsup(WT_CURSOR *cursor, ...); +extern int __wt_cursor_compare_notsup(WT_CURSOR *a, WT_CURSOR *b, int *cmpp); +extern int __wt_cursor_equals_notsup(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp); +extern int __wt_cursor_search_near_notsup(WT_CURSOR *cursor, int *exact); +extern int __wt_cursor_reconfigure_notsup(WT_CURSOR *cursor, const char *config); extern void __wt_cursor_set_notsup(WT_CURSOR *cursor); extern int __wt_cursor_kv_not_set(WT_CURSOR *cursor, bool key); extern int __wt_cursor_get_key(WT_CURSOR *cursor, ...); @@ -337,7 +342,7 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); 
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); -extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp); +extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); @@ -360,7 +365,7 @@ extern int __wt_log_open(WT_SESSION_IMPL *session); extern int __wt_log_close(WT_SESSION_IMPL *session); extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep); extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie); -extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry); +extern int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work); extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); extern int __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags); @@ -441,7 +446,7 @@ extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args); -extern int __wt_meta_btree_apply(WT_SESSION_IMPL *session, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); +extern int __wt_meta_apply_all(WT_SESSION_IMPL *session, int 
(*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[]); extern int __wt_meta_checkpoint(WT_SESSION_IMPL *session, const char *fname, const char *checkpoint, WT_CKPT *ckpt); extern int __wt_meta_checkpoint_last_name( WT_SESSION_IMPL *session, const char *fname, const char **namep); extern int __wt_meta_checkpoint_clear(WT_SESSION_IMPL *session, const char *fname); @@ -481,7 +486,9 @@ extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **va extern int __wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value); extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); extern int __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp); +extern int __wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp); extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); +extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp); extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp); extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg); @@ -490,6 +497,7 @@ extern int __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, bool fail, void *sym_ret); extern int __wt_dlclose(WT_SESSION_IMPL *session, WT_DLH *dlh); extern int __wt_errno(void); +extern int __wt_map_error_rdonly(int error); extern const char *__wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen); extern int __wt_exist(WT_SESSION_IMPL *session, const char *filename, bool *existp); extern void 
__wt_fallocate_config(WT_SESSION_IMPL *session, WT_FH *fh); @@ -552,8 +560,18 @@ extern int __wt_struct_confchk(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v); extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char *fmt, ...); extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...); extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...); -extern int __wt_struct_unpack_size(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, size_t *resultp); -extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp); +extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf); +extern int __wt_ext_pack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, void *buffer, size_t size, WT_PACK_STREAM **psp); +extern int __wt_ext_unpack_start(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, const char *format, const void *buffer, size_t size, WT_PACK_STREAM **psp); +extern int __wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp); +extern int __wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item); +extern int __wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i); +extern int __wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s); +extern int __wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u); +extern int __wt_ext_unpack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item); +extern int __wt_ext_unpack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t *ip); +extern int __wt_ext_unpack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char **sp); +extern int __wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, 
WT_PACK_STREAM *ps, uint64_t *up); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); @@ -572,7 +590,6 @@ extern int __wt_bulk_insert_row(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) extern int __wt_bulk_insert_fix( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); extern int __wt_bulk_insert_fix_bitmap(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk); extern int __wt_bulk_insert_var( WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk, bool deleted); -extern int __wt_schema_create_strip(WT_SESSION_IMPL *session, const char *v1, const char *v2, char **value_ret); extern int __wt_direct_io_size_check(WT_SESSION_IMPL *session, const char **cfg, const char *config_name, uint32_t *allocsizep); extern int __wt_schema_colgroup_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *cgname, const char *config, WT_ITEM *buf); extern int __wt_schema_index_source(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, const char *config, WT_ITEM *buf); @@ -612,6 +629,7 @@ extern WT_DATA_SOURCE *__wt_schema_get_source(WT_SESSION_IMPL *session, const ch extern int __wt_str_name_check(WT_SESSION_IMPL *session, const char *str); extern int __wt_name_check(WT_SESSION_IMPL *session, const char *str, size_t len); extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags); +extern int __wt_session_notsup(WT_SESSION *wt_session); extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_session_release_resources(WT_SESSION_IMPL 
*session); @@ -621,8 +639,8 @@ extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const ch extern int __wt_session_range_truncate(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *start, WT_CURSOR *stop); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); -extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); +extern int __wt_session_compact_readonly( WT_SESSION *wt_session, const char *uri, const char *config); extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, bool *is_deadp); extern int __wt_session_release_btree(WT_SESSION_IMPL *session); extern int __wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], uint32_t flags); @@ -632,6 +650,11 @@ extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *ch extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]); extern uint32_t __wt_cksum(const void *chunk, size_t len); extern void __wt_cksum_init(void); +extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp); +extern int __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); +extern int __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled); +extern int __wt_cond_auto_wait( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress); +extern int __wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_decrypt(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor, size_t skip, WT_ITEM *in, 
WT_ITEM *out); extern int __wt_encrypt(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t skip, WT_ITEM *in, WT_ITEM *out); extern void __wt_encrypt_size(WT_SESSION_IMPL *session, WT_KEYED_ENCRYPTOR *kencryptor, size_t incoming_size, size_t *sizep); @@ -731,7 +754,7 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session); extern int __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_global_destroy(WT_SESSION_IMPL *session); extern int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len); -extern int __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]); +extern int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/include/flags.h b/src/include/flags.h index 24fae4abccd..a6f42a9938f 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -12,13 +12,14 @@ #define WT_CONN_LOG_SERVER_RUN 0x00000080 #define WT_CONN_LSM_MERGE 0x00000100 #define WT_CONN_PANIC 0x00000200 -#define WT_CONN_SERVER_ASYNC 0x00000400 -#define WT_CONN_SERVER_CHECKPOINT 0x00000800 -#define WT_CONN_SERVER_LSM 0x00001000 -#define WT_CONN_SERVER_RUN 0x00002000 -#define WT_CONN_SERVER_STATISTICS 0x00004000 -#define WT_CONN_SERVER_SWEEP 0x00008000 -#define WT_CONN_WAS_BACKUP 0x00010000 +#define WT_CONN_READONLY 0x00000400 +#define WT_CONN_SERVER_ASYNC 0x00000800 +#define WT_CONN_SERVER_CHECKPOINT 0x00001000 +#define WT_CONN_SERVER_LSM 0x00002000 +#define WT_CONN_SERVER_RUN 0x00004000 +#define WT_CONN_SERVER_STATISTICS 0x00008000 +#define WT_CONN_SERVER_SWEEP 0x00010000 +#define WT_CONN_WAS_BACKUP 0x00020000 #define WT_EVICTING 0x00000001 #define WT_EVICT_IN_MEMORY 0x00000002 #define WT_EVICT_LOOKASIDE 0x00000004 @@ -55,20 +56,21 @@ #define 
WT_SESSION_INTERNAL 0x00000004 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 #define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010 -#define WT_SESSION_LOCKED_SCHEMA 0x00000020 -#define WT_SESSION_LOCKED_SLOT 0x00000040 -#define WT_SESSION_LOCKED_TABLE 0x00000080 -#define WT_SESSION_LOCKED_TURTLE 0x00000100 -#define WT_SESSION_LOCK_NO_WAIT 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_SCHEMA 0x00000040 +#define WT_SESSION_LOCKED_SLOT 0x00000080 +#define WT_SESSION_LOCKED_TABLE 0x00000100 +#define WT_SESSION_LOCKED_TURTLE 0x00000200 +#define WT_SESSION_LOCK_NO_WAIT 0x00000400 +#define WT_SESSION_LOGGING_INMEM 0x00000800 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 +#define WT_SESSION_NO_CACHE 0x00002000 +#define WT_SESSION_NO_DATA_HANDLES 0x00004000 +#define WT_SESSION_NO_EVICTION 0x00008000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 +#define WT_SESSION_SERVER_ASYNC 0x00080000 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/include/gcc.h b/src/include/gcc.h index 6ccc0de3c03..ce6afdd6e9c 100644 --- a/src/include/gcc.h +++ b/src/include/gcc.h @@ -6,6 +6,7 @@ * See the file LICENSE for redistribution information. */ +#define WT_PTRDIFFT_FMT "td" /* ptrdiff_t format string */ #define WT_SIZET_FMT "zu" /* size_t format string */ /* Add GCC-specific attributes to types and function declarations. 
*/ diff --git a/src/include/lint.h b/src/include/lint.h index f8b17022968..1b64186cbab 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -6,6 +6,7 @@ * See the file LICENSE for redistribution information. */ +#define WT_PTRDIFFT_FMT "td" /* ptrdiff_t format string */ #define WT_SIZET_FMT "zu" /* size_t format string */ #define WT_COMPILER_TYPE_ALIGN(x) diff --git a/src/include/lsm.h b/src/include/lsm.h index 7cb3ccc895d..444073087df 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -179,7 +179,7 @@ struct __wt_lsm_tree { int collator_owned; uint32_t refcnt; /* Number of users of the tree */ - uint8_t exclusive; /* Tree is locked exclusively */ + WT_SESSION_IMPL *excl_session; /* Session has exclusive lock */ #define LSM_TREE_MAX_QUEUE 100 uint32_t queue_ref; @@ -215,7 +215,7 @@ struct __wt_lsm_tree { size_t chunk_alloc; /* Space allocated for chunks */ uint32_t nchunks; /* Number of active chunks */ uint32_t last; /* Last allocated ID */ - int modified; /* Have there been updates? */ + bool modified; /* Have there been updates? */ WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ size_t old_alloc; /* Space allocated for old chunks */ @@ -242,13 +242,18 @@ struct __wt_lsm_tree { int64_t lsm_lookup_no_bloom; int64_t lsm_merge_throttle; -#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ -#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */ -#define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */ -#define WT_LSM_TREE_MERGES 0x08 /* Tree should run merges */ -#define WT_LSM_TREE_NEED_SWITCH 0x10 /* New chunk needs creating */ -#define WT_LSM_TREE_OPEN 0x20 /* The tree is open */ -#define WT_LSM_TREE_THROTTLE 0x40 /* Throttle updates */ + /* + * The tree is open for business. This used to be a flag, but it is + * susceptible to races. 
+ */ + bool active; + +#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x01 /* Timer for merge aggression */ +#define WT_LSM_TREE_COMPACTING 0x02 /* Tree being compacted */ +#define WT_LSM_TREE_MERGES 0x04 /* Tree should run merges */ +#define WT_LSM_TREE_NEED_SWITCH 0x08 /* New chunk needs creating */ +#define WT_LSM_TREE_OPEN 0x10 /* The tree is open */ +#define WT_LSM_TREE_THROTTLE 0x20 /* Throttle updates */ uint32_t flags; }; diff --git a/src/include/meta.h b/src/include/meta.h index d61022c0c44..ac0f5fedac4 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -21,6 +21,7 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ +#define WT_METAFILE "WiredTiger.wt" /* Metadata table */ #define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ #define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ diff --git a/src/include/misc.h b/src/include/misc.h index 5dadb1b1484..07d52c61eac 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -198,13 +198,9 @@ /* Check if a string matches a prefix. */ #define WT_PREFIX_MATCH(str, pfx) \ - (((const char *)str)[0] == ((const char *)pfx)[0] && \ + (((const char *)(str))[0] == ((const char *)pfx)[0] && \ strncmp((str), (pfx), strlen(pfx)) == 0) -/* Check if a non-nul-terminated string matches a prefix. */ -#define WT_PREFIX_MATCH_LEN(str, len, pfx) \ - ((len) >= strlen(pfx) && WT_PREFIX_MATCH(str, pfx)) - /* Check if a string matches a prefix, and move past it. */ #define WT_PREFIX_SKIP(str, pfx) \ (WT_PREFIX_MATCH(str, pfx) ? 
((str) += strlen(pfx), 1) : 0) diff --git a/src/include/msvc.h b/src/include/msvc.h index 99260a44875..d5be5bd8c60 100644 --- a/src/include/msvc.h +++ b/src/include/msvc.h @@ -13,6 +13,7 @@ #define inline __inline +#define WT_PTRDIFFT_FMT "Id" /* ptrdiff_t format string */ #define WT_SIZET_FMT "Iu" /* size_t format string */ /* diff --git a/src/include/mutex.h b/src/include/mutex.h index f798bfb3ece..04679884930 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -20,6 +20,13 @@ struct __wt_condvar { int waiters; /* Numbers of waiters, or -1 if signalled with no waiters. */ + /* + * The following fields are only used for automatically adjusting + * condition variables. They could be in a separate structure. + */ + uint64_t min_wait; /* Minimum wait duration */ + uint64_t max_wait; /* Maximum wait duration */ + uint64_t prev_wait; /* Wait duration used last time */ }; /* diff --git a/src/include/packing.i b/src/include/packing.i index 784a55ef2ae..35b2ddc43db 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -677,8 +677,8 @@ __wt_struct_unpackv(WT_SESSION_IMPL *session, if (fmt[0] != '\0' && fmt[1] == '\0') { pv.type = fmt[0]; - if ((ret = __unpack_read(session, &pv, &p, size)) == 0) - WT_UNPACK_PUT(session, pv, ap); + WT_RET(__unpack_read(session, &pv, &p, size)); + WT_UNPACK_PUT(session, pv, ap); return (0); } diff --git a/src/include/schema.h b/src/include/schema.h index a51030870c1..f93c596e2ca 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -133,6 +133,14 @@ struct __wt_table { &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) /* + * WT_WITH_METADATA_LOCK -- + * Acquire the metadata lock, perform an operation, drop the lock. + */ +#define WT_WITH_METADATA_LOCK(session, ret, op) \ + WT_WITH_LOCK(session, ret, \ + &S2C(session)->metadata_lock, WT_SESSION_LOCKED_METADATA, op) + +/* * WT_WITH_SCHEMA_LOCK -- * Acquire the schema lock, perform an operation, drop the lock. 
* Check that we are not already holding some other lock: the schema lock @@ -166,6 +174,8 @@ struct __wt_table { */ #define WT_WITHOUT_LOCKS(session, op) do { \ WT_CONNECTION_IMPL *__conn = S2C(session); \ + bool __checkpoint_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ bool __handle_locked = \ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ bool __table_locked = \ @@ -184,7 +194,15 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ __wt_spin_unlock(session, &__conn->schema_lock); \ } \ + if (__checkpoint_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_CHECKPOINT); \ + __wt_spin_unlock(session, &__conn->checkpoint_lock); \ + } \ op; \ + if (__checkpoint_locked) { \ + __wt_spin_lock(session, &__conn->checkpoint_lock); \ + F_SET(session, WT_SESSION_LOCKED_CHECKPOINT); \ + } \ if (__schema_locked) { \ __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ diff --git a/src/include/session.h b/src/include/session.h index 5c3291230b4..7fdb7fc2548 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -126,14 +126,24 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { void *block_manager; /* Block-manager support */ int (*block_manager_cleanup)(WT_SESSION_IMPL *); - /* Checkpoint support */ - struct { - WT_DATA_HANDLE *dhandle; - const char *name; - } *ckpt_handle; /* Handle list */ + /* Checkpoint handles */ + WT_DATA_HANDLE **ckpt_handle; /* Handle list */ u_int ckpt_handle_next; /* Next empty slot */ size_t ckpt_handle_allocated; /* Bytes allocated */ + /* + * Operations acting on handles. + * + * The preferred pattern is to gather all of the required handles at + * the beginning of an operation, then drop any other locks, perform + * the operation, then release the handles. This cannot be easily + * merged with the list of checkpoint handles because some operations + * (such as compact) do checkpoints internally. 
+ */ + WT_DATA_HANDLE **op_handle; /* Handle list */ + u_int op_handle_next; /* Next empty slot */ + size_t op_handle_allocated; /* Bytes allocated */ + void *reconcile; /* Reconciliation support */ int (*reconcile_cleanup)(WT_SESSION_IMPL *); diff --git a/src/include/stat.h b/src/include/stat.h index 51d2fa332e7..f9170dc1a79 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -227,12 +227,22 @@ __wt_stats_clear(void *stats_arg, int slot) */ #define WT_CONNECTION_STATS_BASE 1000 struct __wt_connection_stats { - int64_t async_alloc_race; - int64_t async_alloc_view; + int64_t lsm_work_queue_app; + int64_t lsm_work_queue_manager; + int64_t lsm_rows_merged; + int64_t lsm_checkpoint_throttle; + int64_t lsm_merge_throttle; + int64_t lsm_work_queue_switch; + int64_t lsm_work_units_discarded; + int64_t lsm_work_units_done; + int64_t lsm_work_units_created; + int64_t lsm_work_queue_max; int64_t async_cur_queue; + int64_t async_max_queue; + int64_t async_alloc_race; int64_t async_flush; + int64_t async_alloc_view; int64_t async_full; - int64_t async_max_queue; int64_t async_nowork; int64_t async_op_alloc; int64_t async_op_compact; @@ -240,55 +250,66 @@ struct __wt_connection_stats { int64_t async_op_remove; int64_t async_op_search; int64_t async_op_update; - int64_t block_byte_map_read; - int64_t block_byte_read; - int64_t block_byte_write; - int64_t block_map_read; int64_t block_preload; int64_t block_read; int64_t block_write; - int64_t cache_bytes_dirty; - int64_t cache_bytes_internal; + int64_t block_byte_read; + int64_t block_byte_write; + int64_t block_map_read; + int64_t block_byte_map_read; int64_t cache_bytes_inuse; - int64_t cache_bytes_leaf; - int64_t cache_bytes_max; - int64_t cache_bytes_overflow; int64_t cache_bytes_read; int64_t cache_bytes_write; - int64_t cache_eviction_app; int64_t cache_eviction_checkpoint; - int64_t cache_eviction_clean; - int64_t cache_eviction_deepen; - int64_t cache_eviction_dirty; - int64_t cache_eviction_fail; - int64_t 
cache_eviction_force; - int64_t cache_eviction_force_delete; - int64_t cache_eviction_force_fail; - int64_t cache_eviction_hazard; - int64_t cache_eviction_internal; - int64_t cache_eviction_maximum_page_size; + int64_t cache_eviction_aggressive_set; int64_t cache_eviction_queue_empty; int64_t cache_eviction_queue_not_empty; int64_t cache_eviction_server_evicting; int64_t cache_eviction_server_not_evicting; int64_t cache_eviction_slow; - int64_t cache_eviction_split_internal; - int64_t cache_eviction_split_leaf; - int64_t cache_eviction_walk; int64_t cache_eviction_worker_evicting; - int64_t cache_inmem_split; + int64_t cache_eviction_force_fail; + int64_t cache_eviction_hazard; int64_t cache_inmem_splittable; + int64_t cache_inmem_split; + int64_t cache_eviction_internal; + int64_t cache_eviction_split_internal; + int64_t cache_eviction_split_leaf; int64_t cache_lookaside_insert; int64_t cache_lookaside_remove; - int64_t cache_overhead; - int64_t cache_pages_dirty; + int64_t cache_bytes_max; + int64_t cache_eviction_maximum_page_size; + int64_t cache_eviction_dirty; + int64_t cache_eviction_deepen; + int64_t cache_write_lookaside; int64_t cache_pages_inuse; + int64_t cache_eviction_force; + int64_t cache_eviction_force_delete; + int64_t cache_eviction_app; int64_t cache_read; int64_t cache_read_lookaside; + int64_t cache_eviction_fail; + int64_t cache_eviction_walk; int64_t cache_write; - int64_t cache_write_lookaside; int64_t cache_write_restore; + int64_t cache_overhead; + int64_t cache_bytes_internal; + int64_t cache_bytes_leaf; + int64_t cache_bytes_overflow; + int64_t cache_bytes_dirty; + int64_t cache_pages_dirty; + int64_t cache_eviction_clean; + int64_t cond_auto_wait_reset; + int64_t cond_auto_wait; + int64_t file_open; + int64_t memory_allocation; + int64_t memory_free; + int64_t memory_grow; int64_t cond_wait; + int64_t rwlock_read; + int64_t rwlock_write; + int64_t read_io; + int64_t write_io; int64_t cursor_create; int64_t cursor_insert; int64_t 
cursor_next; @@ -298,96 +319,81 @@ struct __wt_connection_stats { int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; - int64_t cursor_truncate; int64_t cursor_update; + int64_t cursor_truncate; int64_t dh_conn_handle_count; - int64_t dh_session_handles; - int64_t dh_session_sweeps; - int64_t dh_sweep_close; int64_t dh_sweep_ref; + int64_t dh_sweep_close; int64_t dh_sweep_remove; int64_t dh_sweep_tod; int64_t dh_sweeps; - int64_t file_open; - int64_t log_buffer_size; + int64_t dh_session_handles; + int64_t dh_session_sweeps; + int64_t log_slot_switch_busy; + int64_t log_slot_closes; + int64_t log_slot_races; + int64_t log_slot_transitions; + int64_t log_slot_joins; + int64_t log_slot_unbuffered; int64_t log_bytes_payload; int64_t log_bytes_written; - int64_t log_close_yields; - int64_t log_compress_len; - int64_t log_compress_mem; - int64_t log_compress_small; - int64_t log_compress_write_fails; - int64_t log_compress_writes; + int64_t log_zero_fills; int64_t log_flush; + int64_t log_force_write; + int64_t log_force_write_skip; + int64_t log_compress_writes; + int64_t log_compress_write_fails; + int64_t log_compress_small; + int64_t log_release_write_lsn; + int64_t log_scans; + int64_t log_scan_rereads; + int64_t log_write_lsn; + int64_t log_write_lsn_skip; + int64_t log_sync; + int64_t log_sync_dir; + int64_t log_writes; + int64_t log_slot_consolidated; int64_t log_max_filesize; - int64_t log_prealloc_files; int64_t log_prealloc_max; int64_t log_prealloc_missed; + int64_t log_prealloc_files; int64_t log_prealloc_used; - int64_t log_release_write_lsn; int64_t log_scan_records; - int64_t log_scan_rereads; - int64_t log_scans; - int64_t log_slot_closes; + int64_t log_compress_mem; + int64_t log_buffer_size; + int64_t log_compress_len; int64_t log_slot_coalesced; - int64_t log_slot_consolidated; - int64_t log_slot_joins; - int64_t log_slot_races; - int64_t log_slot_switch_busy; - int64_t log_slot_transitions; - int64_t log_slot_unbuffered; - 
int64_t log_sync; - int64_t log_sync_dir; - int64_t log_write_lsn; - int64_t log_writes; - int64_t log_zero_fills; - int64_t lsm_checkpoint_throttle; - int64_t lsm_merge_throttle; - int64_t lsm_rows_merged; - int64_t lsm_work_queue_app; - int64_t lsm_work_queue_manager; - int64_t lsm_work_queue_max; - int64_t lsm_work_queue_switch; - int64_t lsm_work_units_created; - int64_t lsm_work_units_discarded; - int64_t lsm_work_units_done; - int64_t memory_allocation; - int64_t memory_free; - int64_t memory_grow; - int64_t page_busy_blocked; - int64_t page_forcible_evict_blocked; - int64_t page_locked_blocked; - int64_t page_read_blocked; - int64_t page_sleep; - int64_t read_io; - int64_t rec_page_delete; + int64_t log_close_yields; int64_t rec_page_delete_fast; int64_t rec_pages; int64_t rec_pages_eviction; + int64_t rec_page_delete; int64_t rec_split_stashed_bytes; int64_t rec_split_stashed_objects; - int64_t rwlock_read; - int64_t rwlock_write; int64_t session_cursor_open; int64_t session_open; + int64_t page_busy_blocked; + int64_t page_forcible_evict_blocked; + int64_t page_locked_blocked; + int64_t page_read_blocked; + int64_t page_sleep; + int64_t txn_snapshots_created; + int64_t txn_snapshots_dropped; int64_t txn_begin; - int64_t txn_checkpoint; - int64_t txn_checkpoint_generation; int64_t txn_checkpoint_running; + int64_t txn_checkpoint_generation; int64_t txn_checkpoint_time_max; int64_t txn_checkpoint_time_min; int64_t txn_checkpoint_time_recent; int64_t txn_checkpoint_time_total; - int64_t txn_commit; + int64_t txn_checkpoint; int64_t txn_fail_cache; - int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_range; + int64_t txn_pinned_checkpoint_range; int64_t txn_pinned_snapshot_range; - int64_t txn_rollback; - int64_t txn_snapshots_created; - int64_t txn_snapshots_dropped; int64_t txn_sync; - int64_t write_io; + int64_t txn_commit; + int64_t txn_rollback; }; /* @@ -395,102 +401,102 @@ struct __wt_connection_stats { */ #define WT_DSRC_STATS_BASE 2000 struct 
__wt_dsrc_stats { - int64_t allocation_size; - int64_t block_alloc; - int64_t block_checkpoint_size; - int64_t block_extension; - int64_t block_free; - int64_t block_magic; - int64_t block_major; - int64_t block_minor; - int64_t block_reuse_bytes; - int64_t block_size; - int64_t bloom_count; int64_t bloom_false_positive; int64_t bloom_hit; int64_t bloom_miss; int64_t bloom_page_evict; int64_t bloom_page_read; + int64_t bloom_count; + int64_t lsm_chunk_count; + int64_t lsm_generation_max; + int64_t lsm_lookup_no_bloom; + int64_t lsm_checkpoint_throttle; + int64_t lsm_merge_throttle; int64_t bloom_size; + int64_t block_extension; + int64_t block_alloc; + int64_t block_free; + int64_t block_checkpoint_size; + int64_t allocation_size; + int64_t block_reuse_bytes; + int64_t block_magic; + int64_t block_major; + int64_t block_size; + int64_t block_minor; int64_t btree_checkpoint_generation; - int64_t btree_column_deleted; int64_t btree_column_fix; int64_t btree_column_internal; int64_t btree_column_rle; + int64_t btree_column_deleted; int64_t btree_column_variable; - int64_t btree_compact_rewrite; - int64_t btree_entries; int64_t btree_fixed_len; - int64_t btree_maximum_depth; int64_t btree_maxintlkey; int64_t btree_maxintlpage; int64_t btree_maxleafkey; int64_t btree_maxleafpage; int64_t btree_maxleafvalue; + int64_t btree_maximum_depth; + int64_t btree_entries; int64_t btree_overflow; + int64_t btree_compact_rewrite; int64_t btree_row_internal; int64_t btree_row_leaf; int64_t cache_bytes_read; int64_t cache_bytes_write; int64_t cache_eviction_checkpoint; - int64_t cache_eviction_clean; - int64_t cache_eviction_deepen; - int64_t cache_eviction_dirty; int64_t cache_eviction_fail; int64_t cache_eviction_hazard; + int64_t cache_inmem_splittable; + int64_t cache_inmem_split; int64_t cache_eviction_internal; int64_t cache_eviction_split_internal; int64_t cache_eviction_split_leaf; - int64_t cache_inmem_split; - int64_t cache_inmem_splittable; + int64_t cache_eviction_dirty; 
+ int64_t cache_read_overflow; int64_t cache_overflow_value; + int64_t cache_eviction_deepen; + int64_t cache_write_lookaside; int64_t cache_read; int64_t cache_read_lookaside; - int64_t cache_read_overflow; int64_t cache_write; - int64_t cache_write_lookaside; int64_t cache_write_restore; - int64_t compress_raw_fail; - int64_t compress_raw_fail_temporary; - int64_t compress_raw_ok; + int64_t cache_eviction_clean; int64_t compress_read; int64_t compress_write; int64_t compress_write_fail; int64_t compress_write_too_small; - int64_t cursor_create; - int64_t cursor_insert; + int64_t compress_raw_fail_temporary; + int64_t compress_raw_fail; + int64_t compress_raw_ok; int64_t cursor_insert_bulk; + int64_t cursor_create; int64_t cursor_insert_bytes; + int64_t cursor_remove_bytes; + int64_t cursor_update_bytes; + int64_t cursor_insert; int64_t cursor_next; int64_t cursor_prev; int64_t cursor_remove; - int64_t cursor_remove_bytes; int64_t cursor_reset; int64_t cursor_restart; int64_t cursor_search; int64_t cursor_search_near; int64_t cursor_truncate; int64_t cursor_update; - int64_t cursor_update_bytes; - int64_t lsm_checkpoint_throttle; - int64_t lsm_chunk_count; - int64_t lsm_generation_max; - int64_t lsm_lookup_no_bloom; - int64_t lsm_merge_throttle; int64_t rec_dictionary; + int64_t rec_page_delete_fast; + int64_t rec_suffix_compression; int64_t rec_multiblock_internal; - int64_t rec_multiblock_leaf; - int64_t rec_multiblock_max; int64_t rec_overflow_key_internal; + int64_t rec_prefix_compression; + int64_t rec_multiblock_leaf; int64_t rec_overflow_key_leaf; + int64_t rec_multiblock_max; int64_t rec_overflow_value; - int64_t rec_page_delete; - int64_t rec_page_delete_fast; int64_t rec_page_match; int64_t rec_pages; int64_t rec_pages_eviction; - int64_t rec_prefix_compression; - int64_t rec_suffix_compression; + int64_t rec_page_delete; int64_t session_compact; int64_t session_cursor_open; int64_t txn_update_conflict; diff --git a/src/include/txn.i b/src/include/txn.i 
index 46f2ff3e5f1..40e2a6175d6 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -266,6 +266,8 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) } F_SET(txn, WT_TXN_RUNNING); + if (F_ISSET(S2C(session), WT_CONN_READONLY)) + F_SET(txn, WT_TXN_READONLY); return (false); } diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 767c176b53f..1e263f22880 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -828,7 +828,8 @@ struct __wt_session { * @snippet ex_all.c Display an error thread safe * * @param session the session handle - * @param error a return value from a WiredTiger function + * @param error a return value from a WiredTiger, ISO C, or POSIX + * standard API * @returns a string representation of the error */ const char *__F(strerror)(WT_SESSION *session, int error); @@ -873,7 +874,7 @@ struct __wt_session { * updates). See @ref data_sources for more information. * <br> * @copydoc doc_cursor_types - * @param to_dup a cursor to duplicate + * @param to_dup a cursor to duplicate or gather statistics on * @configstart{WT_SESSION.open_cursor, see dist/api_data.py} * @config{append, append the value as a new record\, creating a new * record number key; valid only for cursors with record number keys., a @@ -1409,7 +1410,7 @@ struct __wt_session { * if <code>NULL</code>, the truncate continues to the end of the * object * @configempty{WT_SESSION.truncate, see dist/api_data.py} - * @ebusy_errors + * @errors */ int __F(truncate)(WT_SESSION *session, const char *name, @@ -1893,8 +1894,10 @@ struct __wt_connection { * information. 
Enabling the statistics log server uses a session from * the configured session_max., a set of related configuration options * defined below.} - * @config{ on_close, log - * statistics on database close., a boolean flag; default \c false.} + * @config{ json, encode + * statistics in JSON format., a boolean flag; default \c false.} + * @config{ on_close, log statistics on database + * close., a boolean flag; default \c false.} * @config{ path, the pathname to a file into * which the log records are written\, may contain ISO C standard * strftime conversion specifications. If the value is not an absolute @@ -1908,7 +1911,8 @@ struct __wt_connection { * empty.} * @config{ timestamp, a timestamp * prepended to each log record\, may contain strftime conversion - * specifications., a string; default \c "%b %d %H:%M:%S".} + * specifications\, when \c json is configured\, defaults to \c + * "%FT%Y.000Z"., a string; default \c "%b %d %H:%M:%S".} * @config{ wait, seconds to wait between each * write of the log records; setting this value above 0 configures * statistics logging., an integer between 0 and 100000; default \c 0.} @@ -1982,7 +1986,8 @@ struct __wt_connection { * * @param connection the connection handle * @param errhandler An error handler. If <code>NULL</code>, the - * connection's error handler is used + * connection's error handler is used. See @ref error_handling_event + * for more information. * @configstart{WT_CONNECTION.open_session, see dist/api_data.py} * @config{isolation, the default isolation level for operations in this * session., a string\, chosen from the following options: \c @@ -2143,7 +2148,8 @@ struct __wt_connection { * @param home The path to the database home directory. See @ref home * for more information. * @param errhandler An error handler. If <code>NULL</code>, a builtin error - * handler is installed that writes error messages to stderr + * handler is installed that writes error messages to stderr. 
See + * @ref error_handling_event for more information. * @configstart{wiredtiger_open, see dist/api_data.py} * @config{async = (, asynchronous operations configuration options., a set of * related configuration options defined below.} @@ -2326,6 +2332,9 @@ struct __wt_connection { * start an RPC server for primary processes and use RPC for secondary * processes). <b>Not yet supported in WiredTiger</b>., a boolean flag; default * \c false.} + * @config{readonly, open connection in read-only mode. The database must + * exist. All methods that may modify a database are disabled. See @ref + * readonly for more information., a boolean flag; default \c false.} * @config{session_max, maximum expected number of sessions (including server * threads)., an integer greater than or equal to 1; default \c 100.} * @config{shared_cache = (, shared cache configuration options. A database @@ -2363,23 +2372,26 @@ struct __wt_connection { * maintain\, to a file. See @ref statistics for more information. Enabling * the statistics log server uses a session from the configured session_max., a * set of related configuration options defined below.} - * @config{ on_close, log statistics on database close., - * a boolean flag; default \c false.} - * @config{ path, the - * pathname to a file into which the log records are written\, may contain ISO C - * standard strftime conversion specifications. If the value is not an absolute - * path name\, the file is created relative to the database home., a string; - * default \c "WiredTigerStat.%d.%H".} - * @config{ sources, - * if non-empty\, include statistics for the list of data source URIs\, if they - * are open at the time of the statistics logging. 
The list may include URIs + * @config{ json, encode statistics in JSON format., a + * boolean flag; default \c false.} + * @config{ on_close, + * log statistics on database close., a boolean flag; default \c false.} + * @config{ path, the pathname to a file into which the + * log records are written\, may contain ISO C standard strftime conversion + * specifications. If the value is not an absolute path name\, the file is + * created relative to the database home., a string; default \c + * "WiredTigerStat.%d.%H".} + * @config{ sources, if + * non-empty\, include statistics for the list of data source URIs\, if they are + * open at the time of the statistics logging. The list may include URIs * matching a single data source ("table:mytable")\, or a URI matching all data * sources of a particular type ("table:")., a list of strings; default empty.} * @config{ timestamp, a timestamp prepended to each log - * record\, may contain strftime conversion specifications., a string; default - * \c "%b %d %H:%M:%S".} - * @config{ wait, seconds to wait - * between each write of the log records; setting this value above 0 configures + * record\, may contain strftime conversion specifications\, when \c json is + * configured\, defaults to \c "%FT%Y.000Z"., a string; default \c "%b %d + * %H:%M:%S".} + * @config{ wait, seconds to wait between + * each write of the log records; setting this value above 0 configures * statistics logging., an integer between 0 and 100000; default \c 0.} * @config{ * ),,} @@ -2431,11 +2443,12 @@ int wiredtiger_open(const char *home, WT_CONNECTION **connectionp); /*! - * Return information about a WiredTiger error as a string, not thread-safe. + * Return information about a WiredTiger error as a string (see + * WT_SESSION::strerror for a thread-safe API). 
* * @snippet ex_all.c Display an error * - * @param error a return value from a WiredTiger call + * @param error a return value from a WiredTiger, ISO C, or POSIX standard API * @returns a string representation of the error */ const char *wiredtiger_strerror(int error); @@ -2474,7 +2487,7 @@ struct __wt_async_callback { struct __wt_event_handler { /*! * Callback to handle error messages; by default, error messages are - * written to the stderr stream. + * written to the stderr stream. See @ref error_handling. * * Errors that require the application to exit and restart will have * their \c error value set to \c WT_PANIC. The application can exit @@ -2488,8 +2501,9 @@ struct __wt_event_handler { * @param session the WiredTiger session handle in use when the error * was generated. The handle may have been created by the application * or automatically by WiredTiger. - * @param error a WiredTiger, C99 or POSIX error code, which can - * be converted to a string using ::wiredtiger_strerror + * @param error a return value from a WiredTiger, ISO C, or + * POSIX standard API, which can be converted to a string using + * WT_SESSION::strerror * @param message an error string */ int (*handle_error)(WT_EVENT_HANDLER *handler, @@ -2497,7 +2511,7 @@ struct __wt_event_handler { /*! * Callback to handle informational messages; by default, informational - * messages are written to the stdout stream. + * messages are written to the stdout stream. See @ref error_handling. * * Message handler returns are not ignored: if the handler returns * non-zero, the error may cause the WiredTiger function posting the @@ -2513,7 +2527,7 @@ struct __wt_event_handler { /*! * Callback to handle progress messages; by default, no progress - * messages are written. + * messages are written. See @ref error_handling. 
* * Progress handler returns are not ignored: if the handler returns * non-zero, the error may cause the WiredTiger function posting the @@ -2998,6 +3012,10 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp); */ #define WT_CACHE_FULL -31807 /*! @endcond */ +/*! @cond internal */ +/*! Permission denied (internal). */ +#define WT_PERM_DENIED -31808 +/*! @endcond */ /* * Error return section: END * DO NOT EDIT: automatically built by dist/api_err.py. @@ -3688,329 +3706,341 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * keys. See @ref data_statistics for more information. * @{ */ -/*! async: number of allocation state races */ -#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1000 -/*! async: number of operation slots viewed for allocation */ -#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1001 +/*! LSM: application work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1000 +/*! LSM: merge work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1001 +/*! LSM: rows merged in an LSM tree */ +#define WT_STAT_CONN_LSM_ROWS_MERGED 1002 +/*! LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1003 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1004 +/*! LSM: switch work units currently queued */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1005 +/*! LSM: tree maintenance operations discarded */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1006 +/*! LSM: tree maintenance operations executed */ +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1007 +/*! LSM: tree maintenance operations scheduled */ +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1008 +/*! LSM: tree queue hit maximum */ +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1009 /*! async: current work queue length */ -#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1002 +#define WT_STAT_CONN_ASYNC_CUR_QUEUE 1010 +/*! async: maximum work queue length */ +#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1011 +/*! 
async: number of allocation state races */ +#define WT_STAT_CONN_ASYNC_ALLOC_RACE 1012 /*! async: number of flush calls */ -#define WT_STAT_CONN_ASYNC_FLUSH 1003 +#define WT_STAT_CONN_ASYNC_FLUSH 1013 +/*! async: number of operation slots viewed for allocation */ +#define WT_STAT_CONN_ASYNC_ALLOC_VIEW 1014 /*! async: number of times operation allocation failed */ -#define WT_STAT_CONN_ASYNC_FULL 1004 -/*! async: maximum work queue length */ -#define WT_STAT_CONN_ASYNC_MAX_QUEUE 1005 +#define WT_STAT_CONN_ASYNC_FULL 1015 /*! async: number of times worker found no work */ -#define WT_STAT_CONN_ASYNC_NOWORK 1006 +#define WT_STAT_CONN_ASYNC_NOWORK 1016 /*! async: total allocations */ -#define WT_STAT_CONN_ASYNC_OP_ALLOC 1007 +#define WT_STAT_CONN_ASYNC_OP_ALLOC 1017 /*! async: total compact calls */ -#define WT_STAT_CONN_ASYNC_OP_COMPACT 1008 +#define WT_STAT_CONN_ASYNC_OP_COMPACT 1018 /*! async: total insert calls */ -#define WT_STAT_CONN_ASYNC_OP_INSERT 1009 +#define WT_STAT_CONN_ASYNC_OP_INSERT 1019 /*! async: total remove calls */ -#define WT_STAT_CONN_ASYNC_OP_REMOVE 1010 +#define WT_STAT_CONN_ASYNC_OP_REMOVE 1020 /*! async: total search calls */ -#define WT_STAT_CONN_ASYNC_OP_SEARCH 1011 +#define WT_STAT_CONN_ASYNC_OP_SEARCH 1021 /*! async: total update calls */ -#define WT_STAT_CONN_ASYNC_OP_UPDATE 1012 -/*! block-manager: mapped bytes read */ -#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1013 -/*! block-manager: bytes read */ -#define WT_STAT_CONN_BLOCK_BYTE_READ 1014 -/*! block-manager: bytes written */ -#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1015 -/*! block-manager: mapped blocks read */ -#define WT_STAT_CONN_BLOCK_MAP_READ 1016 +#define WT_STAT_CONN_ASYNC_OP_UPDATE 1022 /*! block-manager: blocks pre-loaded */ -#define WT_STAT_CONN_BLOCK_PRELOAD 1017 +#define WT_STAT_CONN_BLOCK_PRELOAD 1023 /*! block-manager: blocks read */ -#define WT_STAT_CONN_BLOCK_READ 1018 +#define WT_STAT_CONN_BLOCK_READ 1024 /*! 
block-manager: blocks written */ -#define WT_STAT_CONN_BLOCK_WRITE 1019 -/*! cache: tracked dirty bytes in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1020 -/*! cache: tracked bytes belonging to internal pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1021 +#define WT_STAT_CONN_BLOCK_WRITE 1025 +/*! block-manager: bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_READ 1026 +/*! block-manager: bytes written */ +#define WT_STAT_CONN_BLOCK_BYTE_WRITE 1027 +/*! block-manager: mapped blocks read */ +#define WT_STAT_CONN_BLOCK_MAP_READ 1028 +/*! block-manager: mapped bytes read */ +#define WT_STAT_CONN_BLOCK_BYTE_MAP_READ 1029 /*! cache: bytes currently in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_INUSE 1022 -/*! cache: tracked bytes belonging to leaf pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_LEAF 1023 -/*! cache: maximum bytes configured */ -#define WT_STAT_CONN_CACHE_BYTES_MAX 1024 -/*! cache: tracked bytes belonging to overflow pages in the cache */ -#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1025 +#define WT_STAT_CONN_CACHE_BYTES_INUSE 1030 /*! cache: bytes read into cache */ -#define WT_STAT_CONN_CACHE_BYTES_READ 1026 +#define WT_STAT_CONN_CACHE_BYTES_READ 1031 /*! cache: bytes written from cache */ -#define WT_STAT_CONN_CACHE_BYTES_WRITE 1027 -/*! cache: pages evicted by application threads */ -#define WT_STAT_CONN_CACHE_EVICTION_APP 1028 +#define WT_STAT_CONN_CACHE_BYTES_WRITE 1032 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1029 -/*! cache: unmodified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1030 -/*! cache: page split during eviction deepened the tree */ -#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1031 -/*! cache: modified pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1032 -/*! cache: pages selected for eviction unable to be evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1033 -/*! 
cache: pages evicted because they exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1034 -/*! cache: pages evicted because they had chains of deleted items */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1035 -/*! cache: failed eviction of pages that exceeded the in-memory maximum */ -#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1036 -/*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1037 -/*! cache: internal pages evicted */ -#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1038 -/*! cache: maximum page size at eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1039 +#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1033 +/*! cache: eviction currently operating in aggressive mode */ +#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1034 /*! cache: eviction server candidate queue empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1040 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1035 /*! cache: eviction server candidate queue not empty when topping up */ -#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1041 +#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1036 /*! cache: eviction server evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1042 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1037 /*! cache: eviction server populating queue, but not evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1043 +#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1038 /*! cache: eviction server unable to reach eviction goal */ -#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044 -/*! cache: internal pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1045 -/*! cache: leaf pages split during eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1046 -/*! 
cache: pages walked for eviction */ -#define WT_STAT_CONN_CACHE_EVICTION_WALK 1047 +#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1039 /*! cache: eviction worker thread evicting pages */ -#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1048 -/*! cache: in-memory page splits */ -#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049 +#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1040 +/*! cache: failed eviction of pages that exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1041 +/*! cache: hazard pointer blocked page eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1042 /*! cache: in-memory page passed criteria to be split */ -#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1050 +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1043 +/*! cache: in-memory page splits */ +#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044 +/*! cache: internal pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1045 +/*! cache: internal pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1046 +/*! cache: leaf pages split during eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1047 /*! cache: lookaside table insert calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1051 +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1048 /*! cache: lookaside table remove calls */ -#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1052 -/*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1053 -/*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1054 +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1049 +/*! cache: maximum bytes configured */ +#define WT_STAT_CONN_CACHE_BYTES_MAX 1050 +/*! cache: maximum page size at eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1051 +/*! cache: modified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1052 +/*! cache: page split during eviction deepened the tree */ +#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1053 +/*! 
cache: page written requiring lookaside records */ +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1054 /*! cache: pages currently held in the cache */ #define WT_STAT_CONN_CACHE_PAGES_INUSE 1055 +/*! cache: pages evicted because they exceeded the in-memory maximum */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1056 +/*! cache: pages evicted because they had chains of deleted items */ +#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1057 +/*! cache: pages evicted by application threads */ +#define WT_STAT_CONN_CACHE_EVICTION_APP 1058 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1056 +#define WT_STAT_CONN_CACHE_READ 1059 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1057 +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1060 +/*! cache: pages selected for eviction unable to be evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1061 +/*! cache: pages walked for eviction */ +#define WT_STAT_CONN_CACHE_EVICTION_WALK 1062 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1058 -/*! cache: page written requiring lookaside records */ -#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059 +#define WT_STAT_CONN_CACHE_WRITE 1063 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1060 +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1064 +/*! cache: percentage overhead */ +#define WT_STAT_CONN_CACHE_OVERHEAD 1065 +/*! cache: tracked bytes belonging to internal pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1066 +/*! cache: tracked bytes belonging to leaf pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_LEAF 1067 +/*! cache: tracked bytes belonging to overflow pages in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1068 +/*! cache: tracked dirty bytes in the cache */ +#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1069 +/*! cache: tracked dirty pages in the cache */ +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1070 +/*! 
cache: unmodified pages evicted */ +#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1071 +/*! connection: auto adjusting condition resets */ +#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1072 +/*! connection: auto adjusting condition wait calls */ +#define WT_STAT_CONN_COND_AUTO_WAIT 1073 +/*! connection: files currently open */ +#define WT_STAT_CONN_FILE_OPEN 1074 +/*! connection: memory allocations */ +#define WT_STAT_CONN_MEMORY_ALLOCATION 1075 +/*! connection: memory frees */ +#define WT_STAT_CONN_MEMORY_FREE 1076 +/*! connection: memory re-allocations */ +#define WT_STAT_CONN_MEMORY_GROW 1077 /*! connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1061 +#define WT_STAT_CONN_COND_WAIT 1078 +/*! connection: pthread mutex shared lock read-lock calls */ +#define WT_STAT_CONN_RWLOCK_READ 1079 +/*! connection: pthread mutex shared lock write-lock calls */ +#define WT_STAT_CONN_RWLOCK_WRITE 1080 +/*! connection: total read I/Os */ +#define WT_STAT_CONN_READ_IO 1081 +/*! connection: total write I/Os */ +#define WT_STAT_CONN_WRITE_IO 1082 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1062 +#define WT_STAT_CONN_CURSOR_CREATE 1083 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1063 +#define WT_STAT_CONN_CURSOR_INSERT 1084 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1064 +#define WT_STAT_CONN_CURSOR_NEXT 1085 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1065 +#define WT_STAT_CONN_CURSOR_PREV 1086 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1066 +#define WT_STAT_CONN_CURSOR_REMOVE 1087 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1067 +#define WT_STAT_CONN_CURSOR_RESET 1088 /*! cursor: cursor restarted searches */ -#define WT_STAT_CONN_CURSOR_RESTART 1068 +#define WT_STAT_CONN_CURSOR_RESTART 1089 /*! 
cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1069 +#define WT_STAT_CONN_CURSOR_SEARCH 1090 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1070 -/*! cursor: truncate calls */ -#define WT_STAT_CONN_CURSOR_TRUNCATE 1071 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1091 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1072 +#define WT_STAT_CONN_CURSOR_UPDATE 1092 +/*! cursor: truncate calls */ +#define WT_STAT_CONN_CURSOR_TRUNCATE 1093 /*! data-handle: connection data handles currently active */ -#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1073 -/*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1074 -/*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1075 -/*! data-handle: connection sweep dhandles closed */ -#define WT_STAT_CONN_DH_SWEEP_CLOSE 1076 +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1094 /*! data-handle: connection sweep candidate became referenced */ -#define WT_STAT_CONN_DH_SWEEP_REF 1077 +#define WT_STAT_CONN_DH_SWEEP_REF 1095 +/*! data-handle: connection sweep dhandles closed */ +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1096 /*! data-handle: connection sweep dhandles removed from hash list */ -#define WT_STAT_CONN_DH_SWEEP_REMOVE 1078 +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1097 /*! data-handle: connection sweep time-of-death sets */ -#define WT_STAT_CONN_DH_SWEEP_TOD 1079 +#define WT_STAT_CONN_DH_SWEEP_TOD 1098 /*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_SWEEPS 1080 -/*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1081 -/*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1082 +#define WT_STAT_CONN_DH_SWEEPS 1099 +/*! data-handle: session dhandles swept */ +#define WT_STAT_CONN_DH_SESSION_HANDLES 1100 +/*! data-handle: session sweep attempts */ +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1101 +/*! 
log: busy returns attempting to switch slots */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102 +/*! log: consolidated slot closures */ +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1103 +/*! log: consolidated slot join races */ +#define WT_STAT_CONN_LOG_SLOT_RACES 1104 +/*! log: consolidated slot join transitions */ +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105 +/*! log: consolidated slot joins */ +#define WT_STAT_CONN_LOG_SLOT_JOINS 1106 +/*! log: consolidated slot unbuffered writes */ +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1107 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1083 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1108 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1084 -/*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1085 -/*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1086 -/*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1087 -/*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1088 -/*! log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1089 -/*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1090 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1109 +/*! log: log files manually zero-filled */ +#define WT_STAT_CONN_LOG_ZERO_FILLS 1110 /*! log: log flush operations */ -#define WT_STAT_CONN_LOG_FLUSH 1091 +#define WT_STAT_CONN_LOG_FLUSH 1111 +/*! log: log force write operations */ +#define WT_STAT_CONN_LOG_FORCE_WRITE 1112 +/*! log: log force write operations skipped */ +#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1113 +/*! log: log records compressed */ +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1114 +/*! log: log records not compressed */ +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1115 +/*! log: log records too small to compress */ +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1116 +/*! 
log: log release advances write LSN */ +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1117 +/*! log: log scan operations */ +#define WT_STAT_CONN_LOG_SCANS 1118 +/*! log: log scan records requiring two reads */ +#define WT_STAT_CONN_LOG_SCAN_REREADS 1119 +/*! log: log server thread advances write LSN */ +#define WT_STAT_CONN_LOG_WRITE_LSN 1120 +/*! log: log server thread write LSN walk skipped */ +#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1121 +/*! log: log sync operations */ +#define WT_STAT_CONN_LOG_SYNC 1122 +/*! log: log sync_dir operations */ +#define WT_STAT_CONN_LOG_SYNC_DIR 1123 +/*! log: log write operations */ +#define WT_STAT_CONN_LOG_WRITES 1124 +/*! log: logging bytes consolidated */ +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1125 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1092 -/*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1093 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1126 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1094 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1127 /*! log: pre-allocated log files not ready and missed */ -#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1095 +#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1128 +/*! log: pre-allocated log files prepared */ +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1129 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1096 -/*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1097 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1130 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1098 -/*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1099 -/*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1100 -/*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1101 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1131 +/*! 
log: total in-memory size of compressed records */ +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1132 +/*! log: total log buffer size */ +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1133 +/*! log: total size of compressed records */ +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1134 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1102 -/*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1103 -/*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1104 -/*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1105 -/*! log: busy returns attempting to switch slots */ -#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1106 -/*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1107 -/*! log: consolidated slot unbuffered writes */ -#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1108 -/*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1109 -/*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1110 -/*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1111 -/*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1112 -/*! log: log files manually zero-filled */ -#define WT_STAT_CONN_LOG_ZERO_FILLS 1113 -/*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1114 -/*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1115 -/*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1116 -/*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1117 -/*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1118 -/*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1119 -/*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1120 -/*! 
LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1121 -/*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1122 -/*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1123 -/*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1124 -/*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1125 -/*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1126 -/*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1127 -/*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1128 -/*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1129 -/*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1130 -/*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1131 -/*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1132 -/*! reconciliation: pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE 1133 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1135 +/*! log: yields waiting for previous log file close */ +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1136 /*! reconciliation: fast-path pages deleted */ -#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1134 +#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1137 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1135 +#define WT_STAT_CONN_REC_PAGES 1138 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1136 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1139 +/*! reconciliation: pages deleted */ +#define WT_STAT_CONN_REC_PAGE_DELETE 1140 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1137 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1141 /*! 
reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1138 -/*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1139 -/*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1140 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1142 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1141 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1143 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1142 +#define WT_STAT_CONN_SESSION_OPEN 1144 +/*! thread-yield: page acquire busy blocked */ +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1145 +/*! thread-yield: page acquire eviction blocked */ +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1146 +/*! thread-yield: page acquire locked blocked */ +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1147 +/*! thread-yield: page acquire read blocked */ +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1148 +/*! thread-yield: page acquire time sleeping (usecs) */ +#define WT_STAT_CONN_PAGE_SLEEP 1149 +/*! transaction: number of named snapshots created */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1150 +/*! transaction: number of named snapshots dropped */ +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1151 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1143 -/*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1144 -/*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1145 +#define WT_STAT_CONN_TXN_BEGIN 1152 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1146 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1153 +/*! transaction: transaction checkpoint generation */ +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1154 /*! 
transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1147 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1155 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1148 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1156 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1149 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1157 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1150 -/*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1151 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1158 +/*! transaction: transaction checkpoints */ +#define WT_STAT_CONN_TXN_CHECKPOINT 1159 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1152 -/*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1153 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1160 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1154 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1161 +/*! transaction: transaction range of IDs currently pinned by a checkpoint */ +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1162 /*! transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1155 -/*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1156 -/*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1157 -/*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1158 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1163 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1159 -/*! 
connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1160 +#define WT_STAT_CONN_TXN_SYNC 1164 +/*! transaction: transactions committed */ +#define WT_STAT_CONN_TXN_COMMIT 1165 +/*! transaction: transactions rolled back */ +#define WT_STAT_CONN_TXN_ROLLBACK 1166 /*! * @} @@ -4018,200 +4048,200 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); * @anchor statistics_dsrc * @{ */ -/*! block-manager: file allocation unit size */ -#define WT_STAT_DSRC_ALLOCATION_SIZE 2000 -/*! block-manager: blocks allocated */ -#define WT_STAT_DSRC_BLOCK_ALLOC 2001 -/*! block-manager: checkpoint size */ -#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2002 -/*! block-manager: allocations requiring file extension */ -#define WT_STAT_DSRC_BLOCK_EXTENSION 2003 -/*! block-manager: blocks freed */ -#define WT_STAT_DSRC_BLOCK_FREE 2004 -/*! block-manager: file magic number */ -#define WT_STAT_DSRC_BLOCK_MAGIC 2005 -/*! block-manager: file major version number */ -#define WT_STAT_DSRC_BLOCK_MAJOR 2006 -/*! block-manager: minor version number */ -#define WT_STAT_DSRC_BLOCK_MINOR 2007 -/*! block-manager: file bytes available for reuse */ -#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2008 -/*! block-manager: file size in bytes */ -#define WT_STAT_DSRC_BLOCK_SIZE 2009 -/*! LSM: bloom filters in the LSM tree */ -#define WT_STAT_DSRC_BLOOM_COUNT 2010 /*! LSM: bloom filter false positives */ -#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2011 +#define WT_STAT_DSRC_BLOOM_FALSE_POSITIVE 2000 /*! LSM: bloom filter hits */ -#define WT_STAT_DSRC_BLOOM_HIT 2012 +#define WT_STAT_DSRC_BLOOM_HIT 2001 /*! LSM: bloom filter misses */ -#define WT_STAT_DSRC_BLOOM_MISS 2013 +#define WT_STAT_DSRC_BLOOM_MISS 2002 /*! LSM: bloom filter pages evicted from cache */ -#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2014 +#define WT_STAT_DSRC_BLOOM_PAGE_EVICT 2003 /*! LSM: bloom filter pages read into cache */ -#define WT_STAT_DSRC_BLOOM_PAGE_READ 2015 +#define WT_STAT_DSRC_BLOOM_PAGE_READ 2004 +/*! 
LSM: bloom filters in the LSM tree */ +#define WT_STAT_DSRC_BLOOM_COUNT 2005 +/*! LSM: chunks in the LSM tree */ +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2006 +/*! LSM: highest merge generation in the LSM tree */ +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2007 +/*! LSM: queries that could have benefited from a Bloom filter that did + * not exist */ +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2008 +/*! LSM: sleep for LSM checkpoint throttle */ +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2009 +/*! LSM: sleep for LSM merge throttle */ +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2010 /*! LSM: total size of bloom filters */ -#define WT_STAT_DSRC_BLOOM_SIZE 2016 +#define WT_STAT_DSRC_BLOOM_SIZE 2011 +/*! block-manager: allocations requiring file extension */ +#define WT_STAT_DSRC_BLOCK_EXTENSION 2012 +/*! block-manager: blocks allocated */ +#define WT_STAT_DSRC_BLOCK_ALLOC 2013 +/*! block-manager: blocks freed */ +#define WT_STAT_DSRC_BLOCK_FREE 2014 +/*! block-manager: checkpoint size */ +#define WT_STAT_DSRC_BLOCK_CHECKPOINT_SIZE 2015 +/*! block-manager: file allocation unit size */ +#define WT_STAT_DSRC_ALLOCATION_SIZE 2016 +/*! block-manager: file bytes available for reuse */ +#define WT_STAT_DSRC_BLOCK_REUSE_BYTES 2017 +/*! block-manager: file magic number */ +#define WT_STAT_DSRC_BLOCK_MAGIC 2018 +/*! block-manager: file major version number */ +#define WT_STAT_DSRC_BLOCK_MAJOR 2019 +/*! block-manager: file size in bytes */ +#define WT_STAT_DSRC_BLOCK_SIZE 2020 +/*! block-manager: minor version number */ +#define WT_STAT_DSRC_BLOCK_MINOR 2021 /*! btree: btree checkpoint generation */ -#define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2017 -/*! btree: column-store variable-size deleted values */ -#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2018 +#define WT_STAT_DSRC_BTREE_CHECKPOINT_GENERATION 2022 /*! btree: column-store fixed-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019 +#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2023 /*! 
btree: column-store internal pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020 +#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2024 /*! btree: column-store variable-size RLE encoded values */ -#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021 +#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2025 +/*! btree: column-store variable-size deleted values */ +#define WT_STAT_DSRC_BTREE_COLUMN_DELETED 2026 /*! btree: column-store variable-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022 -/*! btree: pages rewritten by compaction */ -#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023 -/*! btree: number of key/value pairs */ -#define WT_STAT_DSRC_BTREE_ENTRIES 2024 +#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2027 /*! btree: fixed-record size */ -#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025 -/*! btree: maximum tree depth */ -#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026 +#define WT_STAT_DSRC_BTREE_FIXED_LEN 2028 /*! btree: maximum internal page key size */ -#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027 +#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2029 /*! btree: maximum internal page size */ -#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028 +#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2030 /*! btree: maximum leaf page key size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029 +#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2031 /*! btree: maximum leaf page size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030 +#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2032 /*! btree: maximum leaf page value size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031 +#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2033 +/*! btree: maximum tree depth */ +#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2034 +/*! btree: number of key/value pairs */ +#define WT_STAT_DSRC_BTREE_ENTRIES 2035 /*! btree: overflow pages */ -#define WT_STAT_DSRC_BTREE_OVERFLOW 2032 +#define WT_STAT_DSRC_BTREE_OVERFLOW 2036 +/*! btree: pages rewritten by compaction */ +#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2037 /*! 
btree: row-store internal pages */ -#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033 +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2038 /*! btree: row-store leaf pages */ -#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034 +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2039 /*! cache: bytes read into cache */ -#define WT_STAT_DSRC_CACHE_BYTES_READ 2035 +#define WT_STAT_DSRC_CACHE_BYTES_READ 2040 /*! cache: bytes written from cache */ -#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036 +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2041 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037 -/*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038 -/*! cache: page split during eviction deepened the tree */ -#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039 -/*! cache: modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040 +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2042 /*! cache: data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2043 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2044 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2045 +/*! cache: in-memory page splits */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 /*! cache: internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2047 /*! cache: internal pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2044 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_INTERNAL 2048 /*! cache: leaf pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2045 -/*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2046 -/*! 
cache: in-memory page passed criteria to be split */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2047 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT_LEAF 2049 +/*! cache: modified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2050 +/*! cache: overflow pages read into cache */ +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2048 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2052 +/*! cache: page split during eviction deepened the tree */ +#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2053 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2054 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2049 +#define WT_STAT_DSRC_CACHE_READ 2055 /*! cache: pages read into cache requiring lookaside entries */ -#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2050 -/*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2051 +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2056 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2052 -/*! cache: page written requiring lookaside records */ -#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2053 +#define WT_STAT_DSRC_CACHE_WRITE 2057 /*! cache: pages written requiring in-memory restoration */ -#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2054 -/*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2055 -/*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2056 -/*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2057 +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2058 +/*! cache: unmodified pages evicted */ +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2059 /*! compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2058 +#define WT_STAT_DSRC_COMPRESS_READ 2060 /*! 
compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2059 +#define WT_STAT_DSRC_COMPRESS_WRITE 2061 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2060 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2062 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2061 -/*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2062 -/*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2063 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2063 +/*! compression: raw compression call failed, additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2064 +/*! compression: raw compression call failed, no additional data available */ +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2065 +/*! compression: raw compression call succeeded */ +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2066 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2064 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2067 +/*! cursor: create calls */ +#define WT_STAT_DSRC_CURSOR_CREATE 2068 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2065 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2069 +/*! cursor: cursor-remove key bytes removed */ +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2070 +/*! cursor: cursor-update value bytes updated */ +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2071 +/*! cursor: insert calls */ +#define WT_STAT_DSRC_CURSOR_INSERT 2072 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2066 +#define WT_STAT_DSRC_CURSOR_NEXT 2073 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2067 +#define WT_STAT_DSRC_CURSOR_PREV 2074 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2068 -/*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2069 +#define WT_STAT_DSRC_CURSOR_REMOVE 2075 /*! 
cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2070 +#define WT_STAT_DSRC_CURSOR_RESET 2076 /*! cursor: restarted searches */ -#define WT_STAT_DSRC_CURSOR_RESTART 2071 +#define WT_STAT_DSRC_CURSOR_RESTART 2077 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2072 +#define WT_STAT_DSRC_CURSOR_SEARCH 2078 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2073 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2079 /*! cursor: truncate calls */ -#define WT_STAT_DSRC_CURSOR_TRUNCATE 2074 +#define WT_STAT_DSRC_CURSOR_TRUNCATE 2080 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2075 -/*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2076 -/*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2077 -/*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2078 -/*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2079 -/*! LSM: queries that could have benefited from a Bloom filter that did - * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2080 -/*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2081 +#define WT_STAT_DSRC_CURSOR_UPDATE 2081 /*! reconciliation: dictionary matches */ #define WT_STAT_DSRC_REC_DICTIONARY 2082 +/*! reconciliation: fast-path pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2083 +/*! reconciliation: internal page key bytes discarded using suffix + * compression */ +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2084 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2083 -/*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2084 -/*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2085 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2085 /*! 
reconciliation: internal-page overflow keys */ #define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086 +/*! reconciliation: leaf page key bytes discarded using prefix compression */ +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2087 +/*! reconciliation: leaf page multi-block writes */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2088 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2087 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2089 +/*! reconciliation: maximum blocks required for a page */ +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2090 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2088 -/*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2089 -/*! reconciliation: fast-path pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2090 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2091 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2091 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2092 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2092 +#define WT_STAT_DSRC_REC_PAGES 2093 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2093 -/*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2094 -/*! reconciliation: internal page key bytes discarded using suffix - * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2095 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2094 +/*! reconciliation: pages deleted */ +#define WT_STAT_DSRC_REC_PAGE_DELETE 2095 /*! session: object compaction */ #define WT_STAT_DSRC_SESSION_COMPACT 2096 /*! 
session: open cursor count */ diff --git a/src/include/wiredtiger_ext.h b/src/include/wiredtiger_ext.h index 0db876b56f3..7d97d97dcf5 100644 --- a/src/include/wiredtiger_ext.h +++ b/src/include/wiredtiger_ext.h @@ -268,8 +268,9 @@ struct __wt_extension_api { WT_SESSION *session, const char *key, const char *value); /*! - * Pack a structure into a buffer. - * See ::wiredtiger_struct_pack for details. + * Pack a structure into a buffer. Deprecated in favor of stream + * based pack and unpack API. See WT_EXTENSION_API::pack_start for + * details. * * @param wt_api the extension handle * @param session the session handle @@ -282,8 +283,8 @@ struct __wt_extension_api { void *buffer, size_t size, const char *format, ...); /*! - * Calculate the size required to pack a structure. - * See ::wiredtiger_struct_size for details. + * Calculate the size required to pack a structure. Deprecated in + * favor of stream based pack and unpack API. * * @param wt_api the extension handle * @param session the session handle @@ -296,8 +297,9 @@ struct __wt_extension_api { size_t *sizep, const char *format, ...); /*! - * Unpack a structure from a buffer. - * See ::wiredtiger_struct_unpack for details. + * Unpack a structure from a buffer. Deprecated in favor of stream + * based pack and unpack API. See WT_EXTENSION_API::unpack_start for + * details. * * @param wt_api the extension handle * @param session the session handle @@ -309,6 +311,130 @@ struct __wt_extension_api { int (*struct_unpack)(WT_EXTENSION_API *wt_api, WT_SESSION *session, const void *buffer, size_t size, const char *format, ...); + /* + * Streaming pack/unpack API. + */ + /*! + * Start a packing operation into a buffer. + * See ::wiredtiger_pack_start for details. 
+ * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory to hold the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ + int (*pack_start)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *format, + void *buffer, size_t size, WT_PACK_STREAM **psp); + + /*! + * Start an unpacking operation from a buffer. + * See ::wiredtiger_unpack_start for details. + * + * @param session the session handle + * @param format the data format, see @ref packing + * @param buffer a pointer to memory holding the packed data + * @param size the size of the buffer + * @param[out] psp the new packing stream handle + * @errors + */ + int (*unpack_start)(WT_EXTENSION_API *wt_api, + WT_SESSION *session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp); + + /*! + * Close a packing stream. + * + * @param ps the packing stream handle + * @param[out] usedp the number of bytes in the buffer used by the + * stream + * @errors + */ + int (*pack_close)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, size_t *usedp); + + /*! + * Pack an item into a packing stream. + * + * @param ps the packing stream handle + * @param item an item to pack + * @errors + */ + int (*pack_item)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item); + + /*! + * Pack a signed integer into a packing stream. + * + * @param ps the packing stream handle + * @param i a signed integer to pack + * @errors + */ + int (*pack_int)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t i); + + /*! + * Pack a string into a packing stream. + * + * @param ps the packing stream handle + * @param s a string to pack + * @errors + */ + int (*pack_str)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char *s); + + /*! + * Pack an unsigned integer into a packing stream. 
+ * + * @param ps the packing stream handle + * @param u an unsigned integer to pack + * @errors + */ + int (*pack_uint)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, uint64_t u); + + /*! + * Unpack an item from a packing stream. + * + * @param ps the packing stream handle + * @param item an item to unpack + * @errors + */ + int (*unpack_item)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item); + + /*! + * Unpack a signed integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] ip the unpacked signed integer + * @errors + */ + int (*unpack_int)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t *ip); + + /*! + * Unpack a string from a packing stream. + * + * @param ps the packing stream handle + * @param[out] sp the unpacked string + * @errors + */ + int (*unpack_str)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char **sp); + + /*! + * Unpack an unsigned integer from a packing stream. + * + * @param ps the packing stream handle + * @param[out] up the unpacked unsigned integer + * @errors + */ + int (*unpack_uint)(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, uint64_t *up); + /*! * Return the current transaction ID. 
* diff --git a/src/log/log.c b/src/log/log.c index ce2d7191491..e41073299a8 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -29,7 +29,7 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) log = conn->log; log->ckpt_lsn = *ckp_lsn; if (conn->log_cond != NULL) - WT_RET(__wt_cond_signal(session, conn->log_cond)); + WT_RET(__wt_cond_auto_signal(session, conn->log_cond)); return (0); } @@ -46,7 +46,7 @@ __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) conn = S2C(session); log = conn->log; - WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_force_write(session, 1, NULL)); WT_RET(__wt_log_wrlsn(session, NULL)); if (start) *lsn = log->write_start_lsn; @@ -118,9 +118,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ if (log->sync_dir_lsn.l.file < min_lsn->l.file) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync directory %s to LSN %d/%lu", - log->log_dir_fh->name, - min_lsn->l.file, min_lsn->l.offset)); + "log_force_sync: sync directory %s to LSN %" PRIu32 + "/%" PRIu32, + log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh)); log->sync_dir_lsn = *min_lsn; WT_STAT_FAST_CONN_INCR(session, log_sync_dir); @@ -130,7 +130,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) */ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync %s to LSN %d/%lu", + "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log->log_fh->name, min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_fsync(session, log->log_fh)); log->sync_lsn = *min_lsn; @@ -273,7 +273,7 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session, * These may be files needed by backup. Force the current slot * to get written to the file. 
*/ - WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_force_write(session, 1, NULL)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); /* Filter out any files that are below the checkpoint LSN. */ @@ -697,7 +697,7 @@ __log_openfile(WT_SESSION_IMPL *session, WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build " " only supports major/minor versions up to %d/%d, " - " and the file is version %d/%d", + " and the file is version %" PRIu16 "/%" PRIu16, WT_LOG_MAJOR_VERSION, WT_LOG_MINOR_VERSION, desc->majorv, desc->minorv); } @@ -824,7 +824,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) if (create_log) { WT_STAT_FAST_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) - WT_RET(__wt_cond_signal( + WT_RET(__wt_cond_auto_signal( session, conn->log_cond)); } } @@ -1088,28 +1088,36 @@ __wt_log_open(WT_SESSION_IMPL *session) WT_RET(__wt_open(session, conn->log_path, false, false, WT_FILE_TYPE_DIRECTORY, &log->log_dir_fh)); } - /* - * Clean up any old interim pre-allocated files. - * We clean up these files because settings have changed upon reboot - * and we want those settings to take effect right away. - */ - WT_ERR(__log_get_files(session, - WT_LOG_TMPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum)); - } - __wt_log_files_free(session, logfiles, logcount); - logfiles = NULL; - logcount = 0; - WT_ERR(__log_get_files(session, - WT_LOG_PREPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum)); + + if (!F_ISSET(conn, WT_CONN_READONLY)) { + /* + * Clean up any old interim pre-allocated files. 
We clean + * up these files because settings have changed upon reboot + * and we want those settings to take effect right away. + */ + WT_ERR(__log_get_files(session, + WT_LOG_TMPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove( + session, WT_LOG_TMPNAME, lognum)); + } + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; + logcount = 0; + WT_ERR(__log_get_files(session, + WT_LOG_PREPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum( + session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove( + session, WT_LOG_PREPNAME, lognum)); + } + __wt_log_files_free(session, logfiles, logcount); + logfiles = NULL; } - __wt_log_files_free(session, logfiles, logcount); - logfiles = NULL; + /* * Now look at the log files and set our LSNs. */ @@ -1121,7 +1129,8 @@ __wt_log_open(WT_SESSION_IMPL *session) } log->fileid = lastlog; WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_open: first log %d last log %d", firstlog, lastlog)); + "log_open: first log %" PRIu32 " last log %" PRIu32, + firstlog, lastlog)); if (firstlog == UINT32_MAX) { WT_ASSERT(session, logcount == 0); WT_INIT_LSN(&log->first_lsn); @@ -1132,9 +1141,11 @@ __wt_log_open(WT_SESSION_IMPL *session) * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ - WT_WITH_SLOT_LOCK(session, log, ret, - ret = __log_newfile(session, true, NULL)); - WT_ERR(ret); + if (!F_ISSET(conn, WT_CONN_READONLY)) { + WT_WITH_SLOT_LOCK(session, log, ret, + ret = __log_newfile(session, true, NULL)); + WT_ERR(ret); + } /* If we found log files, save the new state. 
*/ if (logcount > 0) { @@ -1163,20 +1174,24 @@ __wt_log_close(WT_SESSION_IMPL *session) if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name)); - WT_RET(__wt_fsync(session, log->log_close_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_close_fh)); WT_RET(__wt_close(session, &log->log_close_fh)); } if (log->log_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name)); - WT_RET(__wt_fsync(session, log->log_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_fh)); WT_RET(__wt_close(session, &log->log_fh)); log->log_fh = NULL; } if (log->log_dir_fh != NULL) { WT_RET(__wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name)); - WT_RET(__wt_directory_sync_fh(session, log->log_dir_fh)); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET( + __wt_directory_sync_fh(session, log->log_dir_fh)); WT_RET(__wt_close(session, &log->log_dir_fh)); log->log_dir_fh = NULL; } @@ -1237,10 +1252,8 @@ __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, bool *hole) } } -err: if (buf != NULL) - __wt_free(session, buf); - if (zerobuf != NULL) - __wt_free(session, zerobuf); +err: __wt_free(session, buf); + __wt_free(session, zerobuf); return (ret); } @@ -1324,7 +1337,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) */ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); - WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + WT_ERR(__wt_cond_auto_signal(session, conn->log_wrlsn_cond)); if (++yield_count < WT_THOUSAND) __wt_yield(); else @@ -1381,7 +1394,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LOG, - 
"log_release: sync directory %s to LSN %u/%lu", + "log_release: sync directory %s to LSN %" PRIu32 + "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset)); WT_ERR(__wt_directory_sync_fh( @@ -1396,7 +1410,8 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_release: sync log %s to LSN %u/%lu", + "log_release: sync log %s to LSN %" PRIu32 + "/%" PRIu32, log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset)); WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -1463,7 +1478,7 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_RET(__wt_verbose(session, WT_VERB_LOG, - "__wt_log_scan truncating to %u/%u", + "__wt_log_scan truncating to %" PRIu32 "/%" PRIu32, log->trunc_lsn.l.file, log->trunc_lsn.l.offset)); if (log != NULL) { @@ -1744,14 +1759,25 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); * Wrapper function that takes the lock. */ int -__wt_log_force_write(WT_SESSION_IMPL *session, bool retry) +__wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { WT_LOG *log; WT_MYSLOT myslot; + uint32_t joined; log = S2C(session)->log; memset(&myslot, 0, sizeof(myslot)); + WT_STAT_FAST_CONN_INCR(session, log_force_write); + if (did_work != NULL) + *did_work = true; myslot.slot = log->active_slot; + joined = WT_LOG_SLOT_JOINED(log->active_slot->slot_state); + if (joined == 0) { + WT_STAT_FAST_CONN_INCR(session, log_force_write_skip); + if (did_work != NULL) + *did_work = false; + return (0); + } return (__wt_log_slot_switch(session, &myslot, retry, true)); } @@ -1984,10 +2010,10 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, * XXX I've seen times when conditions are NULL. 
*/ if (conn->log_cond != NULL) { - WT_ERR(__wt_cond_signal(session, conn->log_cond)); + WT_ERR(__wt_cond_auto_signal(session, conn->log_cond)); __wt_yield(); } else - WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_force_write(session, 1, NULL)); } if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ @@ -2114,7 +2140,7 @@ __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) WT_RET(__wt_log_flush_lsn(session, &lsn, false)); WT_RET(__wt_verbose(session, WT_VERB_LOG, - "log_flush: flags %d LSN %u/%lu", + "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32, flags, lsn.l.file, lsn.l.offset)); /* * If the user wants write-no-sync, there is nothing more to do. diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 2844516e78f..570d1c9ce48 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -253,7 +253,7 @@ __wt_log_slot_new(WT_SESSION_IMPL *session) /* * If we didn't find any free slots signal the worker thread. */ - (void)__wt_cond_signal(session, conn->log_wrlsn_cond); + (void)__wt_cond_auto_signal(session, conn->log_wrlsn_cond); __wt_yield(); } /* NOTREACHED */ diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index f76b2bfd9ac..e023b2b407e 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -1501,22 +1501,22 @@ __wt_clsm_open(WT_SESSION_IMPL *session, { WT_CONFIG_ITEM cval; WT_CURSOR_STATIC_INIT(iface, - __wt_cursor_get_key, /* get-key */ - __wt_cursor_get_value, /* get-value */ - __wt_cursor_set_key, /* set-key */ - __wt_cursor_set_value, /* set-value */ - __clsm_compare, /* compare */ - __wt_cursor_equals, /* equals */ - __clsm_next, /* next */ - __clsm_prev, /* prev */ - __clsm_reset, /* reset */ - __clsm_search, /* search */ - __clsm_search_near, /* search-near */ - __clsm_insert, /* insert */ - __clsm_update, /* update */ - __clsm_remove, /* remove */ - __wt_cursor_reconfigure, /* reconfigure */ - __wt_clsm_close); /* close */ + __wt_cursor_get_key, /* get-key */ + __wt_cursor_get_value, /* 
get-value */ + __wt_cursor_set_key, /* set-key */ + __wt_cursor_set_value, /* set-value */ + __clsm_compare, /* compare */ + __wt_cursor_equals, /* equals */ + __clsm_next, /* next */ + __clsm_prev, /* prev */ + __clsm_reset, /* reset */ + __clsm_search, /* search */ + __clsm_search_near, /* search-near */ + __clsm_insert, /* insert */ + __clsm_update, /* update */ + __clsm_remove, /* remove */ + __wt_cursor_reconfigure, /* reconfigure */ + __wt_clsm_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LSM *clsm; WT_DECL_RET; @@ -1556,7 +1556,7 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_ERR(ret); /* Make sure we have exclusive access if and only if we want it */ - WT_ASSERT(session, !bulk || lsm_tree->exclusive); + WT_ASSERT(session, !bulk || lsm_tree->excl_session != NULL); WT_ERR(__wt_calloc_one(session, &clsm)); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index dac8d987328..943a5894ab3 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -212,6 +212,10 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) conn = S2C(session); manager = &conn->lsm_manager; + if (F_ISSET(conn, WT_CONN_READONLY)) { + manager->lsm_workers = 0; + return (0); + } /* * We need at least a manager, a switch thread and a generic * worker. @@ -284,6 +288,8 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager = &conn->lsm_manager; removed = 0; + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + manager->lsm_workers == 0); if (manager->lsm_workers > 0) { /* * Stop the main LSM manager thread first. @@ -384,7 +390,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_LOCKED_HANDLE_LIST); dhandle_locked = true; TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) continue; WT_ERR(__wt_epoch(session, &now)); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 
0 : @@ -427,8 +433,10 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); WT_ERR(__wt_verbose(session, WT_VERB_LSM_MANAGER, - "MGR %s: queue %d mod %d nchunks %d" - " flags 0x%x aggressive %d pushms %" PRIu64 + "MGR %s: queue %" PRIu32 " mod %d " + "nchunks %" PRIu32 + " flags %#" PRIx32 " aggressive %" PRIu32 + " pushms %" PRIu64 " fillms %" PRIu64, lsm_tree->name, lsm_tree->queue_ref, lsm_tree->modified, lsm_tree->nchunks, @@ -616,6 +624,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, manager = &S2C(session)->lsm_manager; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); /* * Don't add merges or bloom filter creates if merges * or bloom filters are disabled in the tree. @@ -641,7 +650,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, * is checked. */ (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + if (!lsm_tree->active) { (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1); return (0); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 29325066da7..6d907284546 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -60,10 +60,11 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { struct timespec now; uint64_t msec_since_last_merge, msec_to_create_merge; - u_int new_aggressive; + uint32_t new_aggressive; new_aggressive = 0; + WT_ASSERT(session, lsm_tree->merge_min != 0); /* * If the tree is open read-only or we are compacting, be very * aggressive. 
Otherwise, we can spend a long time waiting for merges @@ -124,8 +125,9 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (new_aggressive > lsm_tree->merge_aggressiveness) { WT_RET(__wt_verbose(session, WT_VERB_LSM, - "LSM merge %s got aggressive (old %u new %u), " - "merge_min %d, %u / %" PRIu64, + "LSM merge %s got aggressive " + "(old %" PRIu32 " new %" PRIu32 "), " + "merge_min %u, %" PRIu64 " / %" PRIu64, lsm_tree->name, lsm_tree->merge_aggressiveness, new_aggressive, lsm_tree->merge_min, msec_since_last_merge, lsm_tree->chunk_fill_ms)); @@ -410,7 +412,8 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) start_chunk, end_chunk, dest_id, record_count, generation)); for (verb = start_chunk; verb <= end_chunk; verb++) WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Merging %s: Chunk[%u] id %u, gen: %" PRIu32 + "Merging %s: Chunk[%u] id %" PRIu32 + ", gen: %" PRIu32 ", size: %" PRIu64 ", records: %" PRIu64, lsm_tree->name, verb, lsm_tree->chunk[verb]->id, lsm_tree->chunk[verb]->generation, @@ -460,7 +463,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) WT_ERR(EINTR); WT_STAT_FAST_CONN_INCRV(session, diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c index d76b2a48aa7..e19e2cd0126 100644 --- a/src/lsm/lsm_meta.c +++ b/src/lsm/lsm_meta.c @@ -9,17 +9,17 @@ #include "wt_internal.h" /* - * __wt_lsm_meta_read -- - * Read the metadata for an LSM tree. + * __lsm_meta_read_v0 -- + * Read v0 of LSM metadata. 
*/ -int -__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +static int +__lsm_meta_read_v0( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf) { WT_CONFIG cparser, lparser; WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata; WT_DECL_RET; WT_LSM_CHUNK *chunk; - char *lsmconfig; u_int nchunks; chunk = NULL; /* -Wconditional-uninitialized */ @@ -28,8 +28,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) F_SET(lsm_tree, WT_LSM_TREE_MERGES); - WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig)); - WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); + WT_ERR(__wt_config_init(session, &cparser, lsmconf)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->key_format); @@ -48,7 +47,7 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * from the file configuration. */ WT_ERR(__wt_config_getones( - session, lsmconfig, "file_config", &fileconf)); + session, lsmconf, "file_config", &fileconf)); WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_subgets( session, &fileconf, "app_metadata", &metadata)); @@ -160,16 +159,292 @@ __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ } WT_ERR_NOTFOUND_OK(ret); +err: return (ret); +} + +/* + * __lsm_meta_read_v1 -- + * Read v1 of LSM metadata. 
+ */ +static int +__lsm_meta_read_v1( + WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *lsmconf) +{ + WT_CONFIG lparser; + WT_CONFIG_ITEM cv, lk, lv, metadata; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LSM_CHUNK *chunk; + const char *file_cfg[] = { + WT_CONFIG_BASE(session, file_config), NULL, NULL, NULL }; + char *fileconf; + u_int nchunks; + + chunk = NULL; /* -Wconditional-uninitialized */ + + WT_ERR(__wt_config_getones(session, lsmconf, "key_format", &cv)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format)); + WT_ERR(__wt_config_getones(session, lsmconf, "value_format", &cv)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format)); + + WT_ERR(__wt_config_getones(session, lsmconf, "collator", &cv)); + if (cv.len != 0 && !WT_STRING_MATCH("none", cv.str, cv.len)) { + /* Extract the application-supplied metadata (if any). */ + WT_CLEAR(metadata); + WT_ERR_NOTFOUND_OK(__wt_config_getones( + session, lsmconf, "app_metadata", &metadata)); + WT_ERR(__wt_collator_config(session, lsm_tree->name, + &cv, &metadata, + &lsm_tree->collator, &lsm_tree->collator_owned)); + WT_ERR(__wt_strndup(session, + cv.str, cv.len, &lsm_tree->collator_name)); + } + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.auto_throttle", &cv)); + if (cv.val) + F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); + else + F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom", &cv)); + FLD_SET(lsm_tree->bloom, + (cv.val == 0 ? 
WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_oldest", &cv)); + if (cv.val != 0) + FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); + + if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) + WT_ERR_MSG(session, EINVAL, + "Bloom filters can only be created on newest and oldest " + "chunks if bloom filters are enabled"); + + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.bloom_bit_count", &cv)); + lsm_tree->bloom_bit_count = (uint32_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.bloom_config", &cv)); + /* Don't include the brackets. */ + if (cv.type == WT_CONFIG_ITEM_STRUCT) { + cv.str++; + cv.len -= 2; + } + WT_ERR(__wt_config_check(session, + WT_CONFIG_REF(session, WT_SESSION_create), cv.str, cv.len)); + WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->bloom_config)); + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.bloom_hash_count", &cv)); + lsm_tree->bloom_hash_count = (uint32_t)cv.val; + + WT_ERR(__wt_config_getones( + session, lsmconf, "lsm.chunk_count_limit", &cv)); + lsm_tree->chunk_count_limit = (uint32_t)cv.val; + if (cv.val == 0) + F_SET(lsm_tree, WT_LSM_TREE_MERGES); + else + F_CLR(lsm_tree, WT_LSM_TREE_MERGES); + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_max", &cv)); + lsm_tree->chunk_max = (uint64_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.chunk_size", &cv)); + lsm_tree->chunk_size = (uint64_t)cv.val; + + if (lsm_tree->chunk_size > lsm_tree->chunk_max) + WT_ERR_MSG(session, EINVAL, + "Chunk size (chunk_size) must be smaller than or equal to " + "the maximum chunk size (chunk_max)"); + + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_max", &cv)); + lsm_tree->merge_max = (uint32_t)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "lsm.merge_min", &cv)); + lsm_tree->merge_min = (uint32_t)cv.val; + + if (lsm_tree->merge_min > lsm_tree->merge_max) + WT_ERR_MSG(session, EINVAL, 
+ "LSM merge_min must be less than or equal to merge_max"); + + WT_ERR(__wt_config_getones(session, lsmconf, "last", &cv)); + lsm_tree->last = (u_int)cv.val; + WT_ERR(__wt_config_getones(session, lsmconf, "chunks", &cv)); + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if (WT_STRING_MATCH("id", lk.str, lk.len)) { + WT_ERR(__wt_realloc_def(session, + &lsm_tree->chunk_alloc, + nchunks + 1, &lsm_tree->chunk)); + WT_ERR(__wt_calloc_one(session, &chunk)); + lsm_tree->chunk[nchunks++] = chunk; + chunk->id = (uint32_t)lv.val; + WT_ERR(__wt_lsm_tree_chunk_name(session, + lsm_tree, chunk->id, &chunk->uri)); + F_SET(chunk, + WT_LSM_CHUNK_ONDISK | + WT_LSM_CHUNK_STABLE); + } else if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { + WT_ERR(__wt_lsm_tree_bloom_name( + session, lsm_tree, chunk->id, &chunk->bloom_uri)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + continue; + } else if (WT_STRING_MATCH("chunk_size", lk.str, lk.len)) { + chunk->size = (uint64_t)lv.val; + continue; + } else if (WT_STRING_MATCH("count", lk.str, lk.len)) { + chunk->count = (uint64_t)lv.val; + continue; + } else if (WT_STRING_MATCH("generation", lk.str, lk.len)) { + chunk->generation = (uint32_t)lv.val; + continue; + } + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nchunks = nchunks; + + WT_ERR(__wt_config_getones(session, lsmconf, "old_chunks", &cv)); + WT_ERR(__wt_config_subinit(session, &lparser, &cv)); + for (nchunks = 0; (ret = + __wt_config_next(&lparser, &lk, &lv)) == 0; ) { + if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { + WT_ERR(__wt_strndup(session, + lv.str, lv.len, &chunk->bloom_uri)); + F_SET(chunk, WT_LSM_CHUNK_BLOOM); + continue; + } + WT_ERR(__wt_realloc_def(session, + &lsm_tree->old_alloc, nchunks + 1, + &lsm_tree->old_chunks)); + WT_ERR(__wt_calloc_one(session, &chunk)); + lsm_tree->old_chunks[nchunks++] = chunk; + WT_ERR(__wt_strndup(session, + lk.str, lk.len, &chunk->uri)); + F_SET(chunk, 
WT_LSM_CHUNK_ONDISK); + } + WT_ERR_NOTFOUND_OK(ret); + lsm_tree->nold_chunks = nchunks; + + /* + * Set up the config for each chunk. + * + * Make the memory_page_max double the chunk size, so application + * threads don't immediately try to force evict the chunk when the + * worker thread clears the NO_EVICTION flag. + */ + file_cfg[1] = lsmconf; + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "key_format=u,value_format=u,memory_page_max=%" PRIu64, + 2 * lsm_tree->chunk_max)); + file_cfg[2] = buf->data; + WT_ERR(__wt_config_collapse(session, file_cfg, &fileconf)); + lsm_tree->file_config = fileconf; + + /* + * Ignore any other values: the metadata entry might have been + * created by a future release, with unknown options. + */ +err: __wt_scr_free(session, &buf); + return (ret); +} + +/* + * __lsm_meta_upgrade_v1 -- + * Upgrade to v1 of LSM metadata. + */ +static int +__lsm_meta_upgrade_v1(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_DECL_ITEM(buf); + WT_DECL_RET; + const char *new_cfg[] = { + WT_CONFIG_BASE(session, lsm_meta), NULL, NULL, NULL }; + + /* Include the custom config that used to be embedded in file_config. */ + new_cfg[1] = lsm_tree->file_config; + + WT_ERR(__wt_scr_alloc(session, 0, &buf)); + WT_ERR(__wt_buf_fmt(session, buf, + "key_format=%s,value_format=%s", + lsm_tree->key_format, lsm_tree->value_format)); + + WT_ERR(__wt_buf_catfmt(session, buf, ",collator=%s", + lsm_tree->collator_name != NULL ? 
lsm_tree->collator_name : "")); + + WT_ERR(__wt_buf_catfmt(session, buf, ",lsm=(")); + + WT_ERR(__wt_buf_catfmt(session, buf, "auto_throttle=%d", + F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE))); + + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom=%d", + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED))); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_oldest=%d", + FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_bit_count=%" PRIu32, + lsm_tree->bloom_bit_count)); + if (lsm_tree->bloom_config != NULL && + strlen(lsm_tree->bloom_config) > 0) + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config=(%s)", + lsm_tree->bloom_config)); + else + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_config=")); + WT_ERR(__wt_buf_catfmt(session, buf, ",bloom_hash_count=%" PRIu32, + lsm_tree->bloom_hash_count)); + + WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_count_limit=%" PRIu32, + lsm_tree->chunk_count_limit)); + WT_ERR(__wt_buf_catfmt(session, buf, ",chunk_max=%" PRIu64, + lsm_tree->chunk_max)); + WT_ERR(__wt_buf_catfmt(session, buf, ",merge_max=%" PRIu32, + lsm_tree->merge_max)); + WT_ERR(__wt_buf_catfmt(session, buf, ",merge_min=%" PRIu32, + lsm_tree->merge_min)); + + WT_ERR(__wt_buf_catfmt(session, buf, ")")); + + new_cfg[2] = buf->data; + WT_ERR(__wt_config_merge(session, new_cfg, NULL, &lsm_tree->config)); + +err: __wt_scr_free(session, &buf); + return (ret); +} +/* + * __wt_lsm_meta_read -- + * Read the metadata for an LSM tree. + */ +int +__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +{ + WT_CONFIG_ITEM cval; + WT_DECL_RET; + char *lsmconf; + bool upgrade; + + /* LSM trees inherit the merge setting from the connection. 
*/ + if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) + F_SET(lsm_tree, WT_LSM_TREE_MERGES); + + WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconf)); + upgrade = false; + ret = __wt_config_getones(session, lsmconf, "file_config", &cval); + if (ret == 0) { + ret = __lsm_meta_read_v0(session, lsm_tree, lsmconf); + __wt_free(session, lsmconf); + WT_RET(ret); + upgrade = true; + } else if (ret == WT_NOTFOUND) { + lsm_tree->config = lsmconf; + ret = 0; + WT_RET(__lsm_meta_read_v1(session, lsm_tree, lsmconf)); + } /* - * If the default merge_min was not overridden, calculate it now. We - * do this here so that trees created before merge_min was added get a - * sane value. + * If the default merge_min was not overridden, calculate it now. */ if (lsm_tree->merge_min < 2) lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2); - -err: __wt_free(session, lsmconfig); + /* + * If needed, upgrade the configuration. We need to do this after + * we have fixed the merge_min value. + */ + if (upgrade) + WT_RET(__lsm_meta_upgrade_v1(session, lsm_tree)); return (ret); } @@ -184,32 +459,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_DECL_RET; WT_LSM_CHUNK *chunk; u_int i; + const char *new_cfg[] = { NULL, NULL, NULL }; + char *new_metadata; bool first; + new_metadata = NULL; + WT_RET(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_buf_fmt(session, buf, - "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)", - lsm_tree->key_format, lsm_tree->value_format, - lsm_tree->bloom_config, lsm_tree->file_config)); - if (lsm_tree->collator_name != NULL) - WT_ERR(__wt_buf_catfmt( - session, buf, ",collator=%s", lsm_tree->collator_name)); WT_ERR(__wt_buf_catfmt(session, buf, - ",last=%" PRIu32 - ",chunk_count_limit=%" PRIu32 - ",chunk_max=%" PRIu64 - ",chunk_size=%" PRIu64 - ",auto_throttle=%" PRIu32 - ",merge_max=%" PRIu32 - ",merge_min=%" PRIu32 - ",bloom=%" PRIu32 - ",bloom_bit_count=%" PRIu32 - ",bloom_hash_count=%" PRIu32, - 
lsm_tree->last, lsm_tree->chunk_count_limit, - lsm_tree->chunk_max, lsm_tree->chunk_size, - F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0, - lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom, - lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count)); + ",last=%" PRIu32, lsm_tree->last)); WT_ERR(__wt_buf_catfmt(session, buf, ",chunks=[")); for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; @@ -243,9 +501,15 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) session, buf, ",bloom=\"%s\"", chunk->bloom_uri)); } WT_ERR(__wt_buf_catfmt(session, buf, "]")); - ret = __wt_metadata_update(session, lsm_tree->name, buf->data); + + /* Update the existing configuration with the new values. */ + new_cfg[0] = lsm_tree->config; + new_cfg[1] = buf->data; + WT_ERR(__wt_config_collapse(session, new_cfg, &new_metadata)); + ret = __wt_metadata_update(session, lsm_tree->name, new_metadata); WT_ERR(ret); err: __wt_scr_free(session, &buf); + __wt_free(session, new_metadata); return (ret); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index ab18e41a2f5..cb1ddf22f84 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -27,6 +27,7 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) WT_UNUSED(final); /* Only used in diagnostic builds */ + WT_ASSERT(session, !lsm_tree->active); /* * The work unit queue should be empty, but it's worth checking * since work units use a different locking scheme to regular tree @@ -85,19 +86,27 @@ __lsm_tree_discard(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) * Close an LSM tree structure. */ static int -__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) +__lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool final) { WT_DECL_RET; int i; - /* Stop any active merges. */ - F_CLR(lsm_tree, WT_LSM_TREE_ACTIVE); + /* + * Stop any new work units being added. 
The barrier is necessary + * because we rely on the state change being visible before checking + * the tree queue state. + */ + lsm_tree->active = false; + WT_READ_BARRIER(); /* - * Wait for all LSM operations and work units that were in flight to - * finish. + * Wait for all LSM operations to drain. If WiredTiger is shutting + * down also wait for the tree reference count to go to zero, otherwise + * we know a user is holding a reference to the tree, so exclusive + * access is not available. */ - for (i = 0; lsm_tree->refcnt > 1 || lsm_tree->queue_ref > 0; ++i) { + for (i = 0; + lsm_tree->queue_ref > 0 || (final && lsm_tree->refcnt > 1); ++i) { /* * Remove any work units from the manager queues. Do this step * repeatedly in case a work unit was in the process of being @@ -114,11 +123,14 @@ __lsm_tree_close(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (i % WT_THOUSAND == 0) { WT_WITHOUT_LOCKS(session, ret = __wt_lsm_manager_clear_tree(session, lsm_tree)); - WT_RET(ret); + WT_ERR(ret); } __wt_yield(); } return (0); + +err: lsm_tree->active = true; + return (ret); } /* @@ -142,7 +154,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session) * is unconditional. 
*/ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); - WT_TRET(__lsm_tree_close(session, lsm_tree)); + WT_TRET(__lsm_tree_close(session, lsm_tree, true)); WT_TRET(__lsm_tree_discard(session, lsm_tree, true)); } @@ -157,9 +169,12 @@ static int __lsm_tree_set_name(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, const char *uri) { - if (lsm_tree->name != NULL) - __wt_free(session, lsm_tree->name); - WT_RET(__wt_strdup(session, uri, &lsm_tree->name)); + void *p; + + WT_RET(__wt_strdup(session, uri, &p)); + + __wt_free(session, lsm_tree->name); + lsm_tree->name = p; lsm_tree->filename = lsm_tree->name + strlen("lsm:"); return (0); } @@ -306,15 +321,15 @@ int __wt_lsm_tree_create(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config) { - WT_CONFIG_ITEM cval; - WT_DECL_ITEM(buf); WT_DECL_RET; WT_LSM_TREE *lsm_tree; const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_create), config, NULL }; - char *tmpconfig; + { WT_CONFIG_BASE(session, lsm_meta), config, NULL }; + const char *metadata; - /* If the tree is open, it already exists. */ + metadata = NULL; + + /* If the tree can be opened, it already exists. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, uri, false, &lsm_tree)); if (ret == 0) { @@ -323,139 +338,22 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, } WT_RET_NOTFOUND_OK(ret); - /* - * If the tree has metadata, it already exists. - * - * !!! - * Use a local variable: we don't care what the existing configuration - * is, but we don't want to overwrite the real config. - */ - if (__wt_metadata_search(session, uri, &tmpconfig) == 0) { - __wt_free(session, tmpconfig); - return (exclusive ? EEXIST : 0); + if (!F_ISSET(S2C(session), WT_CONN_READONLY)) { + WT_ERR(__wt_config_merge(session, cfg, NULL, &metadata)); + WT_ERR(__wt_metadata_insert(session, uri, metadata)); } - WT_RET_NOTFOUND_OK(ret); - - /* In-memory configurations don't make sense for LSM. 
*/ - if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - WT_RET_MSG(session, EINVAL, - "LSM trees not supported by in-memory configurations"); - - WT_RET(__wt_config_gets(session, cfg, "key_format", &cval)); - if (WT_STRING_MATCH("r", cval.str, cval.len)) - WT_RET_MSG(session, EINVAL, - "LSM trees cannot be configured as column stores"); - - WT_RET(__wt_calloc_one(session, &lsm_tree)); - - WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri)); - - WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->key_format)); - WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->value_format)); - - WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->collator_name)); - - WT_ERR(__wt_config_gets(session, cfg, "cache_resident", &cval)); - if (cval.val != 0) - WT_ERR_MSG(session, EINVAL, - "The cache_resident flag is not compatible with LSM"); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval)); - if (cval.val) - F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); - else - F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval)); - FLD_SET(lsm_tree->bloom, - (cval.val == 0 ? 
WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED)); - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval)); - if (cval.val != 0) - FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST); - - if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) && - FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)) - WT_ERR_MSG(session, EINVAL, - "Bloom filters can only be created on newest and oldest " - "chunks if bloom filters are enabled"); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval)); - if (cval.type == WT_CONFIG_ITEM_STRUCT) { - cval.str++; - cval.len -= 2; - } - WT_ERR(__wt_config_check(session, - WT_CONFIG_REF(session, WT_SESSION_create), cval.str, cval.len)); - WT_ERR(__wt_strndup( - session, cval.str, cval.len, &lsm_tree->bloom_config)); - - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval)); - lsm_tree->bloom_bit_count = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval)); - lsm_tree->bloom_hash_count = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_count_limit", &cval)); - lsm_tree->chunk_count_limit = (uint32_t)cval.val; - if (cval.val == 0) - F_SET(lsm_tree, WT_LSM_TREE_MERGES); - else - F_CLR(lsm_tree, WT_LSM_TREE_MERGES); - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval)); - lsm_tree->chunk_max = (uint64_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval)); - lsm_tree->chunk_size = (uint64_t)cval.val; - if (lsm_tree->chunk_size > lsm_tree->chunk_max) - WT_ERR_MSG(session, EINVAL, - "Chunk size (chunk_size) must be smaller than or equal to " - "the maximum chunk size (chunk_max)"); - WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval)); - lsm_tree->merge_max = (uint32_t)cval.val; - WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval)); - lsm_tree->merge_min = (uint32_t)cval.val; - if (lsm_tree->merge_min > lsm_tree->merge_max) - WT_ERR_MSG(session, EINVAL, - "LSM merge_min must be less than or equal to merge_max"); - 
- /* - * Set up the config for each chunk. - * - * Make the memory_page_max double the chunk size, so application - * threads don't immediately try to force evict the chunk when the - * worker thread clears the NO_EVICTION flag. - */ - WT_ERR(__wt_scr_alloc(session, 0, &buf)); - WT_ERR(__wt_buf_fmt(session, buf, - "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64, - config, 2 * lsm_tree->chunk_max)); - WT_ERR(__wt_strndup( - session, buf->data, buf->size, &lsm_tree->file_config)); - - /* Create the first chunk and flush the metadata. */ - WT_ERR(__wt_lsm_meta_write(session, lsm_tree)); - - /* Discard our partially populated handle. */ - ret = __lsm_tree_discard(session, lsm_tree, false); - lsm_tree = NULL; /* * Open our new tree and add it to the handle cache. Don't discard on * error: the returned handle is NULL on error, and the metadata * tracking macros handle cleaning up on failure. */ - if (ret == 0) - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __lsm_tree_open(session, uri, true, &lsm_tree)); + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __lsm_tree_open(session, uri, true, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); - if (0) { -err: WT_TRET(__lsm_tree_discard(session, lsm_tree, false)); - } - __wt_scr_free(session, &buf); +err: __wt_free(session, metadata); return (ret); } @@ -477,27 +375,26 @@ __lsm_tree_find(WT_SESSION_IMPL *session, /* See if the tree is already open. */ TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) if (strcmp(uri, lsm_tree->name) == 0) { - /* - * Short circuit if the handle is already held - * exclusively or exclusive access is requested and - * there are references held. - */ - if ((exclusive && lsm_tree->refcnt > 0) || - lsm_tree->exclusive) - return (EBUSY); - if (exclusive) { /* * Make sure we win the race to switch on the * exclusive flag. 
*/ - if (!__wt_atomic_cas8( - &lsm_tree->exclusive, 0, 1)) + if (!__wt_atomic_cas_ptr( + &lsm_tree->excl_session, NULL, session)) return (EBUSY); - /* Make sure there are no readers */ - if (!__wt_atomic_cas32( - &lsm_tree->refcnt, 0, 1)) { - lsm_tree->exclusive = 0; + + /* + * Drain the work queue before checking for + * open cursors - otherwise we can generate + * spurious busy returns. + */ + (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); + if (__lsm_tree_close( + session, lsm_tree, false) != 0 || + lsm_tree->refcnt != 1) { + __wt_lsm_tree_release( + session, lsm_tree); return (EBUSY); } } else { @@ -507,11 +404,11 @@ __lsm_tree_find(WT_SESSION_IMPL *session, * We got a reference, check if an exclusive * lock beat us to it. */ - if (lsm_tree->exclusive) { + if (lsm_tree->excl_session != NULL) { WT_ASSERT(session, lsm_tree->refcnt > 0); - (void)__wt_atomic_sub32( - &lsm_tree->refcnt, 1); + __wt_lsm_tree_release( + session, lsm_tree); return (EBUSY); } } @@ -603,7 +500,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, * with getting handles exclusive. */ lsm_tree->refcnt = 1; - lsm_tree->exclusive = exclusive ? 1 : 0; + lsm_tree->excl_session = exclusive ? session : NULL; lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ @@ -611,7 +508,9 @@ __lsm_tree_open(WT_SESSION_IMPL *session, /* Now the tree is setup, make it visible to others. */ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); - F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN); + if (!exclusive) + lsm_tree->active = true; + F_SET(lsm_tree, WT_LSM_TREE_OPEN); *treep = lsm_tree; @@ -638,7 +537,7 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, ret = __lsm_tree_open(session, uri, exclusive, treep); WT_ASSERT(session, ret != 0 || - (exclusive ? 1 : 0) == (*treep)->exclusive); + (*treep)->excl_session == (exclusive ? 
session : NULL)); return (ret); } @@ -650,8 +549,11 @@ void __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_ASSERT(session, lsm_tree->refcnt > 0); - if (lsm_tree->exclusive) - lsm_tree->exclusive = 0; + if (lsm_tree->excl_session == session) { + /* We cleared the active flag when getting exclusive access. */ + lsm_tree->active = true; + lsm_tree->excl_session = NULL; + } (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1); } @@ -868,7 +770,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH); ++lsm_tree->dsk_gen; - lsm_tree->modified = 1; + lsm_tree->modified = true; /* * Set the switch transaction in the previous chunk unless this is @@ -964,9 +866,7 @@ __wt_lsm_tree_drop( WT_WITH_HANDLE_LIST_LOCK(session, ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); - - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); + WT_ASSERT(session, !lsm_tree->active); /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); @@ -995,6 +895,7 @@ __wt_lsm_tree_drop( WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); ret = __wt_metadata_remove(session, name); + WT_ASSERT(session, !lsm_tree->active); err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); WT_WITH_HANDLE_LIST_LOCK(session, @@ -1027,9 +928,6 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, ret = __wt_lsm_tree_get(session, olduri, true, &lsm_tree)); WT_RET(ret); - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); - /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; @@ -1067,8 +965,8 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); - if (old != NULL) - __wt_free(session, old); + __wt_free(session, old); + /* * Discard this LSM tree structure. The first operation on the renamed * tree will create a new one. 
@@ -1102,9 +1000,6 @@ __wt_lsm_tree_truncate( ret = __wt_lsm_tree_get(session, name, true, &lsm_tree)); WT_RET(ret); - /* Shut down the LSM worker. */ - WT_ERR(__lsm_tree_close(session, lsm_tree)); - /* Prevent any new opens. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; @@ -1308,8 +1203,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Compact force flush %s flags 0x%" PRIx32 - " chunk %u flags 0x%" - PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags)); + " chunk %" PRIu32 " flags 0x%" PRIx32, + name, lsm_tree->flags, chunk->id, chunk->flags)); flushing = true; /* * Make sure the in-memory chunk gets flushed do not push a @@ -1331,7 +1226,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) } /* Wait for the work unit queues to drain. */ - while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { + while (lsm_tree->active) { /* * The flush flag is cleared when the chunk has been flushed. * Continue to push forced flushes until the chunk is on disk. @@ -1342,7 +1237,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Compact flush done %s chunk %u. " + "Compact flush done %s chunk %" PRIu32 ". 
" "Start compacting progress %" PRIu64, name, chunk->id, lsm_tree->merge_progressing)); @@ -1353,7 +1248,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) progress = lsm_tree->merge_progressing; } else { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Compact flush retry %s chunk %u", + "Compact flush retry %s chunk %" PRIu32, name, chunk->id)); WT_ERR(__wt_lsm_manager_push_entry(session, WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, @@ -1413,7 +1308,6 @@ err: __wt_lsm_tree_release(session, lsm_tree); return (ret); - } /* @@ -1455,8 +1349,7 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, continue; WT_ERR(__wt_schema_worker(session, chunk->uri, file_func, name_func, cfg, open_flags)); - if (name_func == __wt_backup_list_uri_append && - F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) + if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_schema_worker(session, chunk->bloom_uri, file_func, name_func, cfg, open_flags)); } diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index d5d81df6785..87771e2cb6c 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -29,7 +29,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session, cookie->nchunks = 0; WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) + if (!lsm_tree->active) return (__wt_lsm_tree_readunlock(session, lsm_tree)); /* Take a copy of the current state of the LSM tree. */ @@ -72,14 +72,14 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, { WT_DECL_RET; WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk; - u_int i; + uint32_t i; *chunkp = NULL; chunk = evict_chunk = flush_chunk = NULL; WT_ASSERT(session, lsm_tree->queue_ref > 0); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); - if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0) + if (!lsm_tree->active || lsm_tree->nchunks == 0) return (__wt_lsm_tree_readunlock(session, lsm_tree)); /* Search for a chunk to evict and/or a chunk to flush. 
*/ @@ -118,7 +118,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Flush%s: return chunk %u of %u: %s", + "Flush%s: return chunk %" PRIu32 " of %" PRIu32 ": %s", force ? " w/ force" : "", i, lsm_tree->nchunks, chunk->uri)); @@ -322,7 +322,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; - ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); + ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } @@ -334,11 +334,17 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. + * + * Ensure that we don't race with a running checkpoint: the checkpoint + * lock protects against us racing with an application checkpoint in + * this chunk. Don't wait for it, though: checkpoints can take a long + * time, and our checkpoint operation should be very quick. 
*/ WT_ERR(__wt_meta_track_on(session)); - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker( + session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index 7562cb1cae3..0874da8db13 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -20,7 +20,7 @@ int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) { WT_RET(__wt_verbose(session, WT_VERB_LSM_MANAGER, - "Start LSM worker %d type 0x%x", args->id, args->type)); + "Start LSM worker %u type %#" PRIx32, args->id, args->type)); return (__wt_thread_create(session, &args->tid, __lsm_worker, args)); } @@ -59,9 +59,8 @@ __lsm_worker_general_op( */ if (chunk != NULL) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, - "Flush%s chunk %d %s", - force ? " w/ force" : "", - chunk->id, chunk->uri)); + "Flush%s chunk %" PRIu32 " %s", + force ? " w/ force" : "", chunk->id, chunk->uri)); ret = __wt_lsm_checkpoint_chunk( session, entry->lsm_tree, chunk); WT_ASSERT(session, chunk->refcnt > 0); @@ -140,7 +139,7 @@ __lsm_worker(void *arg) if (ret == WT_NOTFOUND) { F_CLR(entry->lsm_tree, WT_LSM_TREE_COMPACTING); ret = 0; - } else if (ret == EBUSY) + } else if (ret == EBUSY || ret == EINTR) ret = 0; /* Paranoia: clear session state. 
*/ @@ -164,7 +163,7 @@ __lsm_worker(void *arg) if (ret != 0) { err: __wt_lsm_manager_free_work_unit(session, entry); WT_PANIC_MSG(session, ret, - "Error in LSM worker thread %d", cookie->id); + "Error in LSM worker thread %u", cookie->id); } return (WT_THREAD_RET_VALUE); } diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index 92766213b33..fb483c21dd9 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -15,39 +15,41 @@ */ static inline int __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_DECL_RET; const char *uri; - int cmp; + bool skip; - cursor->set_key(cursor, "file:"); - if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) - ret = cursor->next(cursor); - for (; ret == 0; ret = cursor->next(cursor)) { + while ((ret = cursor->next(cursor)) == 0) { WT_RET(cursor->get_key(cursor, &uri)); - if (!WT_PREFIX_MATCH(uri, "file:")) - break; if (strcmp(uri, WT_METAFILE_URI) == 0) continue; + skip = false; + if (name_func != NULL) + WT_RET(name_func(session, uri, &skip)); + + if (file_func == NULL || skip || !WT_PREFIX_MATCH(uri, "file:")) + continue; + /* * We need to pull the handle into the session handle cache * and make sure it's referenced to stop other internal code * dropping the handle (e.g in LSM when cleaning up obsolete * chunks). Holding the metadata lock isn't enough. 
*/ - ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock( - session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single( - session, uri, NULL, func, cfg); + if ((ret = __wt_session_get_btree( + session, uri, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + WT_SAVE_DHANDLE(session, ret = file_func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock( + session, false)); + else + WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } WT_RET_NOTFOUND_OK(ret); @@ -56,20 +58,22 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, } /* - * __wt_meta_btree_apply -- + * __wt_meta_apply_all -- * Apply a function to all files listed in the metadata, apart from the * metadata file. */ int -__wt_meta_btree_apply(WT_SESSION_IMPL *session, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) +__wt_meta_apply_all(WT_SESSION_IMPL *session, + int (*file_func)(WT_SESSION_IMPL *, const char *[]), + int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), + const char *cfg[]) { WT_CURSOR *cursor; WT_DECL_RET; WT_RET(__wt_metadata_cursor(session, &cursor)); - WT_SAVE_DHANDLE(session, - ret = __meta_btree_apply(session, cursor, func, cfg)); + WT_SAVE_DHANDLE(session, ret = + __meta_btree_apply(session, cursor, file_func, name_func, cfg)); WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index df4cd2cb4d6..0a864432daf 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -212,8 +212,7 @@ __ckpt_last_name( if (found && a.val < found) continue; - if (*namep != NULL) - __wt_free(session, *namep); + __wt_free(session, *namep); WT_ERR(__wt_strndup(session, k.str, k.len, namep)); found = a.val; } @@ 
-221,7 +220,7 @@ __ckpt_last_name( ret = WT_NOTFOUND; if (0) { -err: __wt_free(session, namep); +err: __wt_free(session, *namep); } return (ret); } diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 61cc009c983..e5f2727b5b6 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -67,18 +67,16 @@ __wt_metadata_cursor_open( btree = ((WT_CURSOR_BTREE *)(*cursorp))->btree; /* - * Set special flags for the metadata file: eviction (the metadata file - * is in-memory and never evicted), logging (the metadata file is always - * logged if possible). + * Special settings for metadata: skew eviction so metadata almost + * always stays in cache and make sure metadata is logged if possible. * - * Test flags before setting them so updates can't race in subsequent - * opens (the first update is safe because it's single-threaded from + * Test before setting so updates can't race in subsequent opens (the + * first update is safe because it's single-threaded from * wiredtiger_open). */ - if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) - F_SET(btree, WT_BTREE_IN_MEMORY); - if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) - F_SET(btree, WT_BTREE_NO_EVICTION); + if (btree->evict_priority == 0) + WT_WITH_BTREE(session, btree, + __wt_evict_priority_set(session, WT_EVICT_INT_SKEW)); if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) F_CLR(btree, WT_BTREE_NO_LOGGING); diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c index 1baab2deae1..a73b7e09d37 100644 --- a/src/meta/meta_track.c +++ b/src/meta/meta_track.c @@ -284,11 +284,12 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) * should be included in the checkpoint. 
*/ ckpt_session->txn.id = session->txn.id; - F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA); - WT_WITH_DHANDLE(ckpt_session, - WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(ckpt_session, NULL)); - F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA); + F_SET(ckpt_session, WT_SESSION_LOCKED_METADATA); + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(ckpt_session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(ckpt_session, NULL))); + F_CLR(ckpt_session, WT_SESSION_LOCKED_METADATA); ckpt_session->txn.id = WT_TXN_NONE; WT_RET(ret); WT_WITH_DHANDLE(session, diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c index 7182bb0fe5f..471bb65cac0 100644 --- a/src/meta/meta_turtle.c +++ b/src/meta/meta_turtle.c @@ -113,8 +113,9 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) WT_DECL_RET; uint32_t allocsize; bool exist; - const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL }; - const char *key; + const char *filecfg[] = { + WT_CONFIG_BASE(session, file_meta), NULL, NULL }; + const char *key, *value; /* * If a file was being bulk-loaded during the hot backup, it will appear @@ -135,6 +136,8 @@ __metadata_load_bulk(WT_SESSION_IMPL *session) * If the file doesn't exist, assume it's a bulk-loaded file; * retrieve the allocation size and re-create the file. */ + WT_ERR(cursor->get_value(cursor, &value)); + filecfg[1] = value; WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); WT_ERR(__wt_block_manager_create(session, key, allocsize)); @@ -153,10 +156,11 @@ int __wt_turtle_init(WT_SESSION_IMPL *session) { WT_DECL_RET; - bool exist, exist_incr; + bool exist_backup, exist_incr, exist_turtle, load; char *metaconf; metaconf = NULL; + load = false; /* * Discard any turtle setup file left-over from previous runs. This @@ -179,13 +183,29 @@ __wt_turtle_init(WT_SESSION_IMPL *session) * done. 
*/ WT_RET(__wt_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr)); - WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist)); - if (exist) { + WT_RET(__wt_exist(session, WT_METADATA_BACKUP, &exist_backup)); + WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist_turtle)); + if (exist_turtle) { if (exist_incr) WT_RET_MSG(session, EINVAL, "Incremental backup after running recovery " "is not allowed."); - } else { + /* + * If we have a backup file and metadata and turtle files, + * we want to recreate the metadata from the backup. + */ + if (exist_backup) { + WT_RET(__wt_msg(session, "Both %s and %s exist. " + "Recreating metadata from backup.", + WT_METADATA_TURTLE, WT_METADATA_BACKUP)); + WT_RET(__wt_remove_if_exists(session, WT_METAFILE)); + WT_RET(__wt_remove_if_exists( + session, WT_METADATA_TURTLE)); + load = true; + } + } else + load = true; + if (load) { if (exist_incr) F_SET(S2C(session), WT_CONN_WAS_BACKUP); diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c index 3876f9a1afe..cfc7b80450e 100644 --- a/src/os_posix/os_alloc.c +++ b/src/os_posix/os_alloc.c @@ -18,22 +18,13 @@ #include <gperftools/tcmalloc.h> #define calloc tc_calloc +#define malloc tc_malloc #define realloc tc_realloc #define posix_memalign tc_posix_memalign #define free tc_free #endif /* - * There's no malloc interface, WiredTiger never calls malloc. - * - * The problem is an application might allocate memory, write secret stuff in - * it, free the memory, then WiredTiger allocates the memory and uses it for a - * file page or log record, then writes it to disk, without having overwritten - * it fully. That results in the secret stuff being protected by WiredTiger's - * permission mechanisms, potentially inappropriate for the secret stuff. - */ - -/* * __wt_calloc -- * ANSI calloc function. */ @@ -67,12 +58,46 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) } /* - * __wt_realloc -- - * ANSI realloc function. 
+ * __wt_malloc -- + * ANSI malloc function. */ int -__wt_realloc(WT_SESSION_IMPL *session, - size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +__wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp) +{ + void *p; + + /* + * Defensive: if our caller doesn't handle errors correctly, ensure a + * free won't fail. + */ + *(void **)retp = NULL; + + /* + * !!! + * This function MUST handle a NULL WT_SESSION_IMPL handle. + */ + WT_ASSERT(session, bytes_to_allocate != 0); + + if (session != NULL) + WT_STAT_FAST_CONN_INCR(session, memory_allocation); + + if ((p = malloc(bytes_to_allocate)) == NULL) + WT_RET_MSG(session, __wt_errno(), + "memory allocation of %" WT_SIZET_FMT " bytes failed", + bytes_to_allocate); + + *(void **)retp = p; + return (0); +} + +/* + * __realloc_func -- + * ANSI realloc function. + */ +static int +__realloc_func(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, bool clear_memory, + void *retp) { void *p; size_t bytes_allocated; @@ -107,15 +132,12 @@ __wt_realloc(WT_SESSION_IMPL *session, bytes_to_allocate); /* - * Clear the allocated memory -- an application might: allocate memory, - * write secret stuff into it, free the memory, then we re-allocate the - * memory and use it for a file page or log record, and then write it to - * disk. That would result in the secret stuff being protected by the - * WiredTiger permission mechanisms, potentially inappropriate for the - * secret stuff. + * Clear the allocated memory, parts of WiredTiger depend on allocated + * memory being cleared. */ - memset((uint8_t *) - p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); + if (clear_memory) + memset((uint8_t *)p + bytes_allocated, + 0, bytes_to_allocate - bytes_allocated); /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) @@ -126,9 +148,33 @@ __wt_realloc(WT_SESSION_IMPL *session, } /* + * __wt_realloc -- + * WiredTiger's realloc API. 
+ */ +int +__wt_realloc(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +{ + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, true, retp)); +} + +/* + * __wt_realloc_noclear -- + * WiredTiger's realloc API, not clearing allocated memory. + */ +int +__wt_realloc_noclear(WT_SESSION_IMPL *session, + size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) +{ + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, false, retp)); +} + +/* * __wt_realloc_aligned -- * ANSI realloc function that aligns to buffer boundaries, configured with - * the "buffer_alignment" key to wiredtiger_open. + * the "buffer_alignment" key to wiredtiger_open. */ int __wt_realloc_aligned(WT_SESSION_IMPL *session, @@ -184,10 +230,6 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, __wt_free(session, p); p = newp; - /* Clear the allocated memory (see above). */ - memset((uint8_t *)p + bytes_allocated, 0, - bytes_to_allocate - bytes_allocated); - /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) *bytes_allocated_ret = bytes_to_allocate; @@ -200,11 +242,11 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, * If there is no posix_memalign function, or no alignment configured, * fall back to realloc. * - * Windows note: Visual C CRT memalign does not match Posix behavior - * and would also double each allocation so it is bad for memory use + * Windows note: Visual C CRT memalign does not match POSIX behavior + * and would also double each allocation so it is bad for memory use. 
*/ - return (__wt_realloc( - session, bytes_allocated_ret, bytes_to_allocate, retp)); + return (__realloc_func( + session, bytes_allocated_ret, bytes_to_allocate, false, retp)); } /* @@ -221,13 +263,14 @@ __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp) return (0); } - WT_RET(__wt_calloc(session, len + 1, 1, &p)); + WT_RET(__wt_malloc(session, len + 1, &p)); /* * Don't change this to strncpy, we rely on this function to duplicate * "strings" that contain nul bytes. */ memcpy(p, str, len); + ((uint8_t *)p)[len] = '\0'; *(void **)retp = p; return (0); diff --git a/src/os_posix/os_errno.c b/src/os_posix/os_errno.c index a58ae88447e..a0f1202c6ef 100644 --- a/src/os_posix/os_errno.c +++ b/src/os_posix/os_errno.c @@ -23,6 +23,22 @@ __wt_errno(void) } /* + * __wt_map_error_rdonly -- + * Map an error into a WiredTiger error code specific for + * read-only operation which intercepts based on certain types + * of failures. + */ +int +__wt_map_error_rdonly(int error) +{ + if (error == ENOENT) + return (WT_NOTFOUND); + else if (error == EACCES) + return (WT_PERM_DENIED); + return (error); +} + +/* * __wt_strerror -- * POSIX implementation of WT_SESSION.strerror and wiredtiger_strerror. */ diff --git a/src/os_posix/os_fallocate.c b/src/os_posix/os_fallocate.c index 9d160afd179..bf20a99bdef 100644 --- a/src/os_posix/os_fallocate.c +++ b/src/os_posix/os_fallocate.c @@ -115,6 +115,7 @@ __wt_fallocate( { WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); switch (fh->fallocate_available) { /* * Check for already configured handles and make the configured call. 
diff --git a/src/os_posix/os_fsync.c b/src/os_posix/os_fsync.c index f5afddc557b..0bd0359338b 100644 --- a/src/os_posix/os_fsync.c +++ b/src/os_posix/os_fsync.c @@ -60,6 +60,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) #ifdef __linux__ WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); if ((ret = __wt_handle_sync(fh->fd)) == 0) return (0); WT_RET_MSG(session, ret, "%s: fsync", fh->name); @@ -108,6 +109,7 @@ __wt_directory_sync(WT_SESSION_IMPL *session, const char *path) if (ret != 0) WT_RET_MSG(session, ret, "%s: open", path); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); if ((ret = __wt_handle_sync(fd)) != 0) WT_ERR_MSG(session, ret, "%s: fsync", path); @@ -134,6 +136,9 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: fsync", fh->name)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); if ((ret = __wt_handle_sync(fh->fd)) == 0) return (0); WT_RET_MSG(session, ret, "%s fsync error", fh->name); @@ -149,6 +154,7 @@ __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) #ifdef HAVE_SYNC_FILE_RANGE WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "%s: sync_file_range", fh->name)); diff --git a/src/os_posix/os_ftruncate.c b/src/os_posix/os_ftruncate.c index 2af90512f26..94d6cba3bf5 100644 --- a/src/os_posix/os_ftruncate.c +++ b/src/os_posix/os_ftruncate.c @@ -17,6 +17,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) { WT_DECL_RET; + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret); if (ret == 0) return (0); diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index b085676c53b..219b26c2fa1 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -73,7 +73,16 @@ __wt_open(WT_SESSION_IMPL *session, 
goto setupfh; } - f = O_RDWR; + /* + * If this is a read-only connection, open all files read-only + * except the lock file. + */ + if (F_ISSET(conn, WT_CONN_READONLY) && + !WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))) + f = O_RDONLY; + else + f = O_RDWR; #ifdef O_BINARY /* Windows clones: we always want to treat the file as a binary. */ f |= O_BINARY; @@ -94,6 +103,9 @@ __wt_open(WT_SESSION_IMPL *session, #endif if (ok_create) { + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); f |= O_CREAT; if (exclusive) f |= O_EXCL; diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c index bc244c12e46..eb2e37fdc38 100644 --- a/src/os_posix/os_remove.c +++ b/src/os_posix/os_remove.c @@ -21,6 +21,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) uint64_t bucket; conn = S2C(session); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); fh = NULL; bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; diff --git a/src/os_posix/os_rename.c b/src/os_posix/os_rename.c index 301190305c4..8ec4ee3aa23 100644 --- a/src/os_posix/os_rename.c +++ b/src/os_posix/os_rename.c @@ -21,6 +21,7 @@ __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); from_path = to_path = NULL; WT_RET(__wt_filename(session, from, &from_path)); diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c index 8733bfe0f53..3d49fa7e712 100644 --- a/src/os_posix/os_rw.c +++ b/src/os_posix/os_rw.c @@ -65,6 +65,9 @@ __wt_write(WT_SESSION_IMPL *session, "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, len, (uintmax_t)offset)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); /* Assert direct I/O is aligned and a 
multiple of the alignment. */ WT_ASSERT(session, !fh->direct_io || diff --git a/src/os_posix/os_stdio.c b/src/os_posix/os_stdio.c index 7ab107eda1e..65a0f40a659 100644 --- a/src/os_posix/os_stdio.c +++ b/src/os_posix/os_stdio.c @@ -46,8 +46,7 @@ __wt_fopen(WT_SESSION_IMPL *session, if (*fpp == NULL) ret = __wt_errno(); - if (pathbuf != NULL) - __wt_free(session, pathbuf); + __wt_free(session, pathbuf); if (ret == 0) return (0); diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c index 6a9daf8443f..590fcdc9d44 100644 --- a/src/os_win/os_errno.c +++ b/src/os_win/os_errno.c @@ -17,11 +17,13 @@ static const int windows_error_offset = -29000; * Windows errors are from 0 - 15999 according to the documentation */ static DWORD -__wt_map_error_to_windows_error(int error) { - /* Ensure we do not exceed the error range - Also validate he do not get any COM errors - (which are negative integers) - */ +__wt_map_error_to_windows_error(int error) +{ + /* + * Ensure we do not exceed the error range + * Also validate we do not get any COM errors + * (which are negative integers) + */ WT_ASSERT(NULL, error < 0); return (error + -(windows_error_offset)); @@ -32,11 +34,28 @@ __wt_map_error_to_windows_error(int error) { * Return a positive integer, a decoded Windows error */ static int -__wt_map_windows_error_to_error(DWORD winerr) { +__wt_map_windows_error_to_error(DWORD winerr) +{ return (winerr + windows_error_offset); } /* + * __wt_map_error_rdonly -- + * Map an error into a WiredTiger error code specific for + * read-only operation which intercepts based on certain types + * of failures. + */ +int +__wt_map_error_rdonly(int winerr) +{ + if (winerr == ERROR_FILE_NOT_FOUND) + return (WT_NOTFOUND); + else if (winerr == ERROR_ACCESS_DENIED) + return (WT_PERM_DENIED); + return (winerr); +} + +/* * __wt_errno -- * Return errno, or WT_ERROR if errno not set. 
*/ diff --git a/src/os_win/os_fallocate.c b/src/os_win/os_fallocate.c index cdc7a1c46ee..a324687ca73 100644 --- a/src/os_win/os_fallocate.c +++ b/src/os_win/os_fallocate.c @@ -35,6 +35,7 @@ int __wt_fallocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); WT_UNUSED(offset); diff --git a/src/os_win/os_fsync.c b/src/os_win/os_fsync.c index 913b7ca5a4e..c196fc6c06a 100644 --- a/src/os_win/os_fsync.c +++ b/src/os_win/os_fsync.c @@ -15,6 +15,7 @@ int __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); return (0); @@ -27,6 +28,7 @@ __wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh) int __wt_directory_sync(WT_SESSION_IMPL *session, const char *path) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(path); return (0); @@ -44,6 +46,9 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: FlushFileBuffers", fh->name)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE) WT_RET_MSG(session, __wt_errno(), "%s FlushFileBuffers error", fh->name); @@ -58,6 +63,7 @@ __wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh) int __wt_fsync_async(WT_SESSION_IMPL *session, WT_FH *fh) { + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); WT_UNUSED(session); WT_UNUSED(fh); diff --git a/src/os_win/os_ftruncate.c b/src/os_win/os_ftruncate.c index 0c11b5509b7..88fcf9542c1 100644 --- a/src/os_win/os_ftruncate.c +++ b/src/os_win/os_ftruncate.c @@ -18,6 +18,7 @@ __wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len) WT_DECL_RET; LARGE_INTEGER largeint; + WT_ASSERT(session, !F_ISSET(S2C(session), 
WT_CONN_READONLY)); largeint.QuadPart = len; if ((ret = SetFilePointerEx( diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c index 3ec53daf001..f10582c5bd1 100644 --- a/src/os_win/os_open.c +++ b/src/os_win/os_open.c @@ -58,7 +58,17 @@ __wt_open(WT_SESSION_IMPL *session, WT_RET(__wt_filename(session, name, &path)); - share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + /* + * If this is a read-only connection, open all files read-only + * except the lock file. + */ + if (F_ISSET(conn, WT_CONN_READONLY) && + !WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))) + share_mode = FILE_SHARE_READ; + else + share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE; + /* * Security: * The application may spawn a new process, and we don't want another @@ -72,6 +82,9 @@ __wt_open(WT_SESSION_IMPL *session, dwCreationDisposition = 0; if (ok_create) { + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY) || + WT_STRING_MATCH(name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); dwCreationDisposition = CREATE_NEW; if (exclusive) dwCreationDisposition = CREATE_ALWAYS; diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c index 5682a25d7f2..84f1dd86674 100644 --- a/src/os_win/os_remove.c +++ b/src/os_win/os_remove.c @@ -21,6 +21,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) uint64_t bucket; conn = S2C(session); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); fh = NULL; bucket = __wt_hash_city64(name, strlen(name)) % WT_HASH_ARRAY_SIZE; diff --git a/src/os_win/os_rename.c b/src/os_win/os_rename.c index 829ab1d16e9..b4be2dba24c 100644 --- a/src/os_win/os_rename.c +++ b/src/os_win/os_rename.c @@ -22,6 +22,7 @@ __wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to) WT_RET(__wt_verbose( session, WT_VERB_FILEOPS, "rename %s to %s", from, to)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY)); from_path = to_path = NULL; WT_RET(__wt_filename(session, from, &from_path)); diff --git a/src/os_win/os_rw.c 
b/src/os_win/os_rw.c index 49f011001a4..a9537a648f9 100644 --- a/src/os_win/os_rw.c +++ b/src/os_win/os_rw.c @@ -74,6 +74,9 @@ __wt_write(WT_SESSION_IMPL *session, "%s: write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, len, (uintmax_t)offset)); + WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) || + WT_STRING_MATCH(fh->name, WT_SINGLETHREAD, + strlen(WT_SINGLETHREAD))); /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !fh->direct_io || diff --git a/src/packing/pack_impl.c b/src/packing/pack_impl.c index 0e3ed44ba6a..5dbb0f33842 100644 --- a/src/packing/pack_impl.c +++ b/src/packing/pack_impl.c @@ -107,36 +107,6 @@ __wt_struct_unpack(WT_SESSION_IMPL *session, } /* - * __wt_struct_unpack_size -- - * Determine the packed size of a buffer matching the format. - */ -int -__wt_struct_unpack_size(WT_SESSION_IMPL *session, - const void *buffer, size_t size, const char *fmt, size_t *resultp) -{ - WT_DECL_PACK_VALUE(pv); - WT_DECL_RET; - WT_PACK pack; - const uint8_t *p, *end; - - p = buffer; - end = p + size; - - WT_RET(__pack_init(session, &pack, fmt)); - while ((ret = __pack_next(&pack, &pv)) == 0) - WT_RET(__unpack_read(session, &pv, &p, (size_t)(end - p))); - - /* Be paranoid - __pack_write should never overflow. */ - WT_ASSERT(session, p <= end); - - if (ret != WT_NOTFOUND) - return (ret); - - *resultp = WT_PTRDIFF(p, buffer); - return (0); -} - -/* * __wt_struct_repack -- * Return the subset of the packed buffer that represents part of * the format. 
If the result is not contiguous in the existing @@ -144,70 +114,43 @@ __wt_struct_unpack_size(WT_SESSION_IMPL *session, */ int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, - const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp) + const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf) { WT_DECL_PACK_VALUE(pvin); WT_DECL_PACK_VALUE(pvout); WT_DECL_RET; WT_PACK packin, packout; const uint8_t *before, *end, *p; - uint8_t *pout; - size_t len; const void *start; start = NULL; p = inbuf->data; end = p + inbuf->size; - /* - * Handle this non-contiguous case: 'U' -> 'u' at the end of the buf. - * The former case has the size embedded before the item, the latter - * does not. - */ - if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' && - strlen(infmt) > len && infmt[len - 1] == 'U') { - WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp)); - pout = *reallocp; - } else - pout = NULL; - - WT_ERR(__pack_init(session, &packout, outfmt)); - WT_ERR(__pack_init(session, &packin, infmt)); + WT_RET(__pack_init(session, &packout, outfmt)); + WT_RET(__pack_init(session, &packin, infmt)); /* Outfmt should complete before infmt */ while ((ret = __pack_next(&packout, &pvout)) == 0) { if (p >= end) - WT_ERR(EINVAL); - WT_ERR(__pack_next(&packin, &pvin)); + WT_RET(EINVAL); + if (pvout.type == 'x' && pvout.size == 0 && pvout.havesize) + continue; + WT_RET(__pack_next(&packin, &pvin)); before = p; - WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p))); - if (pvout.type != pvin.type) { - if (pvout.type == 'u' && pvin.type == 'U') { - /* Skip the prefixed size, we don't need it */ - WT_ERR(__wt_struct_unpack_size(session, before, - (size_t)(end - before), "I", &len)); - before += len; - } else - WT_ERR(ENOTSUP); - } - if (pout != NULL) { - memcpy(pout, before, WT_PTRDIFF(p, before)); - pout += p - before; - } else if (start == NULL) + WT_RET(__unpack_read(session, &pvin, &p, (size_t)(end - p))); + if (pvout.type != pvin.type) + 
WT_RET(ENOTSUP); + if (start == NULL) start = before; } - WT_ERR_NOTFOUND_OK(ret); + WT_RET_NOTFOUND_OK(ret); /* Be paranoid - __pack_write should never overflow. */ WT_ASSERT(session, p <= end); - if (pout != NULL) { - outbuf->data = *reallocp; - outbuf->size = WT_PTRDIFF(pout, *reallocp); - } else { - outbuf->data = start; - outbuf->size = WT_PTRDIFF(p, start); - } + outbuf->data = start; + outbuf->size = WT_PTRDIFF(p, start); -err: return (ret); + return (0); } diff --git a/src/packing/pack_stream.c b/src/packing/pack_stream.c index 98da5b405c3..1393eb9a9c1 100644 --- a/src/packing/pack_stream.c +++ b/src/packing/pack_stream.c @@ -65,8 +65,7 @@ wiredtiger_pack_close(WT_PACK_STREAM *ps, size_t *usedp) if (usedp != NULL) *usedp = WT_PTRDIFF(ps->p, ps->start); - if (ps != NULL) - __wt_free(ps->pack.session, ps); + __wt_free(ps->pack.session, ps); return (0); } @@ -327,3 +326,139 @@ wiredtiger_unpack_uint(WT_PACK_STREAM *ps, uint64_t *up) } return (0); } + +/* + * __wt_ext_pack_start -- + * WT_EXTENSION.pack_start method. 
+ */ +int +__wt_ext_pack_start(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *format, + void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if (wt_session == NULL) + wt_session = (WT_SESSION *)conn->default_session; + return (wiredtiger_pack_start(wt_session, format, buffer, size, psp)); +} + +/* + * __wt_ext_unpack_start -- + * WT_EXTENSION.unpack_start + */ +int +__wt_ext_unpack_start(WT_EXTENSION_API *wt_api, + WT_SESSION *wt_session, const char *format, + const void *buffer, size_t size, WT_PACK_STREAM **psp) +{ + WT_CONNECTION_IMPL *conn; + + conn = (WT_CONNECTION_IMPL *)wt_api->conn; + if (wt_session == NULL) + wt_session = (WT_SESSION *)conn->default_session; + return (wiredtiger_unpack_start(wt_session, format, buffer, size, psp)); +} + +/* + * __wt_ext_pack_close -- + * WT_EXTENSION.pack_close + */ +int +__wt_ext_pack_close(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, size_t *usedp) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_close(ps, usedp)); +} + +/* + * __wt_ext_pack_item -- + * WT_EXTENSION.pack_item + */ +int +__wt_ext_pack_item(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_item(ps, item)); +} + +/* + * __wt_ext_pack_int -- + * WT_EXTENSION.pack_int + */ +int +__wt_ext_pack_int(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, int64_t i) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_int(ps, i)); +} + +/* + * __wt_ext_pack_str -- + * WT_EXTENSION.pack_str + */ +int +__wt_ext_pack_str(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, const char *s) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_str(ps, s)); +} + +/* + * __wt_ext_pack_uint -- + * WT_EXTENSION.pack_uint + */ +int +__wt_ext_pack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t u) +{ + WT_UNUSED(wt_api); + return (wiredtiger_pack_uint(ps, u)); +} + +/* + * __wt_ext_unpack_item -- + * WT_EXTENSION.unpack_item + */ 
+int +__wt_ext_unpack_item(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, WT_ITEM *item) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_item(ps, item)); +} + +/* + * __wt_ext_unpack_int -- + * WT_EXTENSION.unpack_int + */ +int +__wt_ext_unpack_int(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, int64_t *ip) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_int(ps, ip)); +} + +/* + * __wt_ext_unpack_str -- + * WT_EXTENSION.unpack_str + */ +int +__wt_ext_unpack_str(WT_EXTENSION_API *wt_api, + WT_PACK_STREAM *ps, const char **sp) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_str(ps, sp)); +} + +/* + * __wt_ext_unpack_uint -- + * WT_EXTENSION.unpack_uint + */ +int +__wt_ext_unpack_uint(WT_EXTENSION_API *wt_api, WT_PACK_STREAM *ps, uint64_t *up) +{ + WT_UNUSED(wt_api); + return (wiredtiger_unpack_uint(ps, up)); +} diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index c25d7b5e493..a69f335c9b3 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -363,6 +363,17 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. + */ + WT_RET(__wt_fair_lock(session, &page->page_lock)); + + /* * Check that transaction time always moves forward for a given page. * If this check fails, reconciliation can free something that a future * reconciliation will need. 
@@ -376,17 +387,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, session, ref, flags, salvage, &session->reconcile)); r = session->reconcile; - /* - * Reconciliation locks the page for three reasons: - * Reconciliation reads the lists of page updates, obsolete updates - * cannot be discarded while reconciliation is in progress; - * The compaction process reads page modification information, which - * reconciliation modifies; - * In-memory splits: reconciliation of an internal page cannot handle - * a child page splitting during the reconciliation. - */ - WT_RET(__wt_fair_lock(session, &page->page_lock)); - /* Reconcile the page. */ switch (page->type) { case WT_PAGE_COL_FIX: @@ -1313,7 +1313,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } while (0) typedef enum { - WT_CHILD_IGNORE, /* Deleted child: ignore */ + WT_CHILD_IGNORE, /* Ignored child */ WT_CHILD_MODIFIED, /* Modified child */ WT_CHILD_ORIGINAL, /* Original child */ WT_CHILD_PROXY /* Deleted child: proxy */ @@ -1450,16 +1450,15 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * This function is called when walking an internal page to decide how - * to handle child pages referenced by the internal page, specifically - * if the child page is to be merged into its parent. + * to handle child pages referenced by the internal page. * * Internal pages are reconciled for two reasons: first, when evicting * an internal page, second by the checkpoint code when writing internal - * pages. During eviction, the subtree is locked down so all pages - * should be in the WT_REF_DISK or WT_REF_LOCKED state. During - * checkpoint, any eviction that might affect our review of an internal - * page is prohibited, however, as the subtree is not reserved for our - * exclusive use, there are other page states that must be considered. + * pages. During eviction, all pages should be in the WT_REF_DISK or + * WT_REF_DELETED state. 
During checkpoint, eviction that might affect + * review of an internal page is prohibited, however, as the subtree is + * not reserved for our exclusive use, there are other page states that + * must be considered. */ for (;; __wt_yield()) switch (r->tested_ref_state = ref->state) { @@ -1488,15 +1487,14 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Locked. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page was selected by - * the eviction server for eviction). + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, the child is being @@ -1514,24 +1512,21 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * In memory. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page belongs to a file - * being discarded from the cache during close). + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, acquire a hazard pointer * so the child isn't evicted, it's an in-memory case. 
* - * This call cannot return split/restart, eviction of - * pages that split into their parent is shutout during - * checkpoint, all splits in process will have completed - * before we walk any pages for checkpoint. + * This call cannot return split/restart, we have a lock + * on the parent which prevents a child page split. */ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1548,29 +1543,31 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Being read, not modified by definition. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused normally eviction to fail, and exclusive - * eviction shouldn't ever see pages being read. + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. */ - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } goto done; case WT_REF_SPLIT: /* * The page was split out from under us. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused eviction to fail. + * We should never be here during eviction, active child + * pages in an evicted page's subtree fails the eviction + * attempt. * * We should never be here during checkpoint, dirty page * eviction is shutout during checkpoint, all splits in * process will have completed before we walk any pages * for checkpoint. */ - WT_ASSERT(session, ref->state != WT_REF_SPLIT); - /* FALLTHROUGH */ + WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); + return (EBUSY); WT_ILLEGAL_VALUE(session); } @@ -1581,11 +1578,21 @@ in_memory: * modify structure has been instantiated. If the modify structure * exists and the page has actually been modified, set that state. 
* If that's not the case, we would normally use the original cell's - * disk address as our reference, but, if we're forced to instantiate - * a deleted child page and it's never modified, we end up here with - * a page that has a modify structure, no modifications, and no disk - * address. Ignore those pages, they're not modified and there is no - * reason to write the cell. + * disk address as our reference, however there are two special cases, + * both flagged by a missing block address. + * + * First, if forced to instantiate a deleted child page and it's never + * modified, we end up here with a page that has a modify structure, no + * modifications, and no disk address. Ignore those pages, they're not + * modified and there is no reason to write the cell. + * + * Second, insert splits are permitted during checkpoint. When doing the + * final checkpoint pass, we first walk the internal page's page-index + * and write out any dirty pages we find, then we write out the internal + * page in post-order traversal. If we found the split page in the first + * step, it will have an address; if we didn't find the split page in + * the first step, it won't have an address and we ignore it, it's not + * part of the checkpoint. */ mod = ref->page->modify; if (mod != NULL && mod->rec_result != 0) @@ -1953,12 +1960,21 @@ __rec_split_init(WT_SESSION_IMPL *session, WT_RET(__wt_buf_init(session, &r->disk_image, corrected_page_size)); /* - * Clear the disk page's header and block-manager space, set the page - * type (the type doesn't change, and setting it later would require - * additional code in a few different places). + * Clear the disk page header to ensure all of it is initialized, even + * the unused fields. + * + * In the case of fixed-length column-store, clear the entire buffer: + * fixed-length column-store sets bits in bytes, where the bytes are + * assumed to initially be 0. + */ + memset(r->disk_image.mem, 0, page->type == WT_PAGE_COL_FIX ? 
+ corrected_page_size : WT_PAGE_HEADER_SIZE); + + /* + * Set the page type (the type doesn't change, and setting it later + * would require additional code in a few different places). */ dsk = r->disk_image.mem; - memset(dsk, 0, WT_PAGE_HEADER_BYTE_SIZE(btree)); dsk->type = page->type; /* @@ -3019,13 +3035,13 @@ __rec_split_fixup(WT_SESSION_IMPL *session, WT_RECONCILE *r) * The data isn't laid out on a page boundary or nul padded; copy it to * a clean, aligned, padded buffer before writing it. * - * Allocate a scratch buffer to hold the new disk image. Copy the - * WT_PAGE_HEADER header onto the scratch buffer, most of the header - * information remains unchanged between the pages. + * Allocate a scratch buffer to hold the new disk image. Copy the disk + * page's header and block-manager space into the scratch buffer, most + * of the header information remains unchanged between the pages. */ WT_RET(__wt_scr_alloc(session, r->disk_image.memsize, &tmp)); dsk = tmp->mem; - memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_SIZE); + memcpy(dsk, r->disk_image.mem, WT_PAGE_HEADER_BYTE_SIZE(btree)); /* * For each split chunk we've created, update the disk image and copy @@ -3808,7 +3824,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: - /* Deleted child we don't have to write. */ + /* Ignored child. */ WT_CHILD_RELEASE_ERR(session, hazard, ref); continue; @@ -3977,7 +3993,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) 
* * The record number recorded during the split is the @@ -3999,8 +4015,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) } else { WT_RET( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; recno = WT_INSERT_RECNO(ins); } for (;;) { @@ -4536,22 +4550,25 @@ compare: /* * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) * + * Assert the recorded record number is past the end of + * the page. + * * The record number recorded during the split is the * first key on the split page, that is, one larger than * the last key on this page, we have to decrement it. */ if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB) break; + WT_ASSERT(session, n >= src_recno); n -= 1; + upd = NULL; } else { WT_ERR( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; n = WT_INSERT_RECNO(ins); } while (src_recno <= n) { @@ -4734,10 +4751,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: /* - * Deleted child we don't have to write. + * Ignored child. * - * Overflow keys referencing discarded pages are no - * longer useful, schedule them for discard. Don't + * Overflow keys referencing pages we're not writing are + * no longer useful, schedule them for discard. Don't * worry about instantiation, internal page keys are * always instantiated. Don't worry about reuse, * reusing this key in this reconciliation is unlikely. 
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c index 9b3b76b62de..756f1fdcc6c 100644 --- a/src/schema/schema_create.c +++ b/src/schema/schema_create.c @@ -9,22 +9,6 @@ #include "wt_internal.h" /* - * __wt_schema_create_strip -- - * Discard any configuration information from a schema entry that is not - * applicable to an session.create call, here for the wt dump command utility, - * which only wants to dump the schema information needed for load. - */ -int -__wt_schema_create_strip(WT_SESSION_IMPL *session, - const char *v1, const char *v2, char **value_ret) -{ - const char *cfg[] = - { WT_CONFIG_BASE(session, WT_SESSION_create), v1, v2, NULL }; - - return (__wt_config_collapse(session, cfg, value_ret)); -} - -/* * __wt_direct_io_size_check -- * Return a size from the configuration, complaining if it's insufficient * for direct I/O. diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index 49318f80959..e7ce4e42498 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -109,8 +109,7 @@ __wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table) err: __wt_scr_free(session, &buf); __wt_schema_destroy_colgroup(session, &colgroup); - if (cgconfig != NULL) - __wt_free(session, cgconfig); + __wt_free(session, cgconfig); return (ret); } diff --git a/src/schema/schema_plan.c b/src/schema/schema_plan.c index 612a2d2d192..12a1aa9c22f 100644 --- a/src/schema/schema_plan.c +++ b/src/schema/schema_plan.c @@ -212,7 +212,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, WT_ASSERT(session, !value_only || coltype == WT_PROJ_VALUE); WT_RET(__wt_buf_catfmt( - session, plan, "%d%c", cg, coltype)); + session, plan, "%u%c", cg, coltype)); /* * Set the current column group and column @@ -226,7 +226,7 @@ __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, if (current_col < col) { if (col - current_col > 1) WT_RET(__wt_buf_catfmt(session, - plan, "%d", col - current_col)); + plan, "%u", col - current_col)); 
WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP)); } @@ -375,8 +375,8 @@ __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, pv.type = 'u'; if (pv.havesize) - WT_RET(__wt_buf_catfmt( - session, format, "%d%c", (int)pv.size, pv.type)); + WT_RET(__wt_buf_catfmt(session, + format, "%" PRIu32 "%c", pv.size, pv.type)); else WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type)); } while (have_next); @@ -399,8 +399,8 @@ __wt_struct_truncate(WT_SESSION_IMPL *session, while (ncols-- > 0) { WT_RET(__pack_next(&pack, &pv)); if (pv.havesize) - WT_RET(__wt_buf_catfmt( - session, format, "%d%c", (int)pv.size, pv.type)); + WT_RET(__wt_buf_catfmt(session, + format, "%" PRIu32 "%c", pv.size, pv.type)); else WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type)); } diff --git a/src/schema/schema_truncate.c b/src/schema/schema_truncate.c index e7752b60ca4..d9a798b6ed8 100644 --- a/src/schema/schema_truncate.c +++ b/src/schema/schema_truncate.c @@ -131,22 +131,19 @@ int __wt_schema_range_truncate( WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop) { - WT_CURSOR *cursor; WT_DATA_SOURCE *dsrc; WT_DECL_RET; const char *uri; - cursor = (start != NULL) ? 
start : stop; - uri = cursor->internal_uri; + uri = start->internal_uri; if (WT_PREFIX_MATCH(uri, "file:")) { - if (start != NULL) - WT_CURSOR_NEEDKEY(start); + WT_CURSOR_NEEDKEY(start); if (stop != NULL) WT_CURSOR_NEEDKEY(stop); - WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree, + WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)start)->btree, ret = __wt_btcur_range_truncate( - (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop)); + (WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop)); } else if (WT_PREFIX_MATCH(uri, "table:")) ret = __wt_table_range_truncate( (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop); diff --git a/src/schema/schema_worker.c b/src/schema/schema_worker.c index b5ee3bb7f7d..52be76bb7a5 100644 --- a/src/schema/schema_worker.c +++ b/src/schema/schema_worker.c @@ -55,18 +55,11 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(ret); } - if ((ret = __wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)) == 0) { - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) { - WT_ASSERT(session, !FLD_ISSET( - open_flags, WT_DHANDLE_EXCLUSIVE)); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single_ckpt( - session, uri, file_func, cfg)); - } + WT_ERR(__wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)); + WT_SAVE_DHANDLE(session, + ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { @@ -133,7 +126,7 @@ __wt_schema_worker(WT_SESSION_IMPL *session, dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_checkpoint) ; - else if (file_func == __wt_checkpoint_list) + else if (file_func == __wt_checkpoint_get_handles) ; else if (file_func == __wt_checkpoint_sync) ; diff --git a/src/session/session_api.c b/src/session/session_api.c index c03b5fdc044..bb496494234 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c 
@@ -13,6 +13,20 @@ static int __session_snapshot(WT_SESSION *, const char *); static int __session_rollback_transaction(WT_SESSION *, const char *); /* + * __wt_session_notsup -- + * Unsupported session method. + */ +int +__wt_session_notsup(WT_SESSION *wt_session) +{ + WT_SESSION_IMPL *session; + + session = (WT_SESSION_IMPL *)wt_session; + + WT_RET_MSG(session, ENOTSUP, "Unsupported session method"); +} + +/* * __wt_session_reset_cursors -- * Reset all open cursors. */ @@ -26,7 +40,8 @@ __wt_session_reset_cursors(WT_SESSION_IMPL *session, bool free_buffers) /* Stop when there are no positioned cursors. */ if (session->ncursors == 0) break; - WT_TRET(cursor->reset(cursor)); + if (!F_ISSET(cursor, WT_CURSTD_JOINED)) + WT_TRET(cursor->reset(cursor)); /* Optionally, free the cursor buffers */ if (free_buffers) { __wt_buf_free(session, &cursor->key); @@ -478,10 +493,13 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) /* * We can't disallow type entirely, a configuration string might * innocently include it, for example, a dump/load pair. If the - * URI type prefix and the type are the same, let it go. + * underlying type is "file", it's OK ("file" is the underlying + * type for every type); if the URI type prefix and the type are + * the same, let it go. */ if ((ret = __wt_config_getones(session, config, "type", &cval)) == 0 && + !WT_STRING_MATCH("file", cval.str, cval.len) && (strncmp(uri, cval.str, cval.len) != 0 || uri[cval.len] != ':')) WT_ERR_MSG(session, EINVAL, @@ -495,6 +513,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_create_readonly -- + * WT_SESSION->create method; readonly version. + */ +static int +__session_create_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_log_flush -- * WT_SESSION->log_flush method. 
*/ @@ -532,6 +564,18 @@ err: API_END_RET(session, ret); } /* + * __session_log_flush_readonly -- + * WT_SESSION->log_flush method; readonly version. + */ +static int +__session_log_flush_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_log_printf -- * WT_SESSION->log_printf method. */ @@ -554,6 +598,19 @@ err: API_END_RET(session, ret); } /* + * __session_log_printf_readonly -- + * WT_SESSION->log_printf method; readonly version. + */ +static int +__session_log_printf_readonly(WT_SESSION *wt_session, const char *fmt, ...) + WT_GCC_FUNC_ATTRIBUTE((format (printf, 2, 3))) +{ + WT_UNUSED(fmt); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_rebalance -- * WT_SESSION->rebalance method. */ @@ -567,9 +624,6 @@ __session_rebalance(WT_SESSION *wt_session, const char *uri, const char *config) SESSION_API_CALL(session, rebalance, config, cfg); - if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) - WT_ERR(ENOTSUP); - /* Block out checkpoints to avoid spurious EBUSY errors. */ WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, @@ -580,6 +634,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_rebalance_readonly -- + * WT_SESSION->rebalance method; readonly version. + */ +static int +__session_rebalance_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_rename -- * WT_SESSION->rename method. 
*/ @@ -597,14 +665,30 @@ __session_rename(WT_SESSION *wt_session, WT_ERR(__wt_str_name_check(session, uri)); WT_ERR(__wt_str_name_check(session, newuri)); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_rename(session, uri, newuri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_rename_readonly -- + * WT_SESSION->rename method; readonly version. + */ +static int +__session_rename_readonly(WT_SESSION *wt_session, + const char *uri, const char *newuri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(newuri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_reset -- * WT_SESSION->reset method. */ @@ -646,9 +730,10 @@ __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) if (!lock_wait) F_SET(session, WT_SESSION_LOCK_NO_WAIT); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_drop(session, uri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); if (!lock_wait) F_CLR(session, WT_SESSION_LOCK_NO_WAIT); @@ -679,6 +764,20 @@ err: /* Note: drop operations cannot be unrolled (yet?). */ } /* + * __session_drop_readonly -- + * WT_SESSION->drop method; readonly version. + */ +static int +__session_drop_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_join -- * WT_SESSION->join method. */ @@ -823,6 +922,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_salvage_readonly -- + * WT_SESSION->salvage method; readonly version. 
+ */ +static int +__session_salvage_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __wt_session_range_truncate -- * Session handling of a range truncate. */ @@ -1004,6 +1117,22 @@ err: TXN_API_END_RETRY(session, ret, 0); } /* + * __session_truncate_readonly -- + * WT_SESSION->truncate method; readonly version. + */ +static int +__session_truncate_readonly(WT_SESSION *wt_session, + const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(start); + WT_UNUSED(stop); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_upgrade -- * WT_SESSION->upgrade method. */ @@ -1026,6 +1155,20 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_upgrade_readonly -- + * WT_SESSION->upgrade method; readonly version. + */ +static int +__session_upgrade_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_verify -- * WT_SESSION->verify method. */ @@ -1247,6 +1390,18 @@ err: API_END_RET(session, ret); } /* + * __session_transaction_sync_readonly -- + * WT_SESSION->transaction_sync method; readonly version. + */ +static int +__session_transaction_sync_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_checkpoint -- * WT_SESSION->checkpoint method. */ @@ -1295,6 +1450,18 @@ err: API_END_RET_NOTFOUND_MAP(session, ret); } /* + * __session_checkpoint_readonly -- + * WT_SESSION->checkpoint method; readonly version. + */ +static int +__session_checkpoint_readonly(WT_SESSION *wt_session, const char *config) +{ + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} + +/* * __session_snapshot -- * WT_SESSION->snapshot method. 
*/ @@ -1380,6 +1547,33 @@ __open_session(WT_CONNECTION_IMPL *conn, __session_snapshot, __session_transaction_pinned_range, __session_transaction_sync + }, stds_readonly = { + NULL, + NULL, + __session_close, + __session_reconfigure, + __session_strerror, + __session_open_cursor, + __session_create_readonly, + __wt_session_compact_readonly, + __session_drop_readonly, + __session_join, + __session_log_flush_readonly, + __session_log_printf_readonly, + __session_rebalance_readonly, + __session_rename_readonly, + __session_reset, + __session_salvage_readonly, + __session_truncate_readonly, + __session_upgrade_readonly, + __session_verify, + __session_begin_transaction, + __session_commit_transaction, + __session_rollback_transaction, + __session_checkpoint_readonly, + __session_snapshot, + __session_transaction_pinned_range, + __session_transaction_sync_readonly }; WT_DECL_RET; WT_SESSION_IMPL *session, *session_ret; @@ -1407,7 +1601,7 @@ __open_session(WT_CONNECTION_IMPL *conn, if (i == conn->session_size) WT_ERR_MSG(session, ENOMEM, "only configured to support %" PRIu32 " sessions" - " (including %d additional internal sessions)", + " (including %" PRIu32 " additional internal sessions)", conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* @@ -1419,7 +1613,8 @@ __open_session(WT_CONNECTION_IMPL *conn, conn->session_cnt = i + 1; session_ret->id = i; - session_ret->iface = stds; + session_ret->iface = + F_ISSET(conn, WT_CONN_READONLY) ? stds_readonly : stds; session_ret->iface.connection = &conn->iface; WT_ERR(__wt_cond_alloc(session, "session", false, &session_ret->cond)); diff --git a/src/session/session_compact.c b/src/session/session_compact.c index 5abccbd1366..2a53ad58f52 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -97,13 +97,13 @@ */ /* - * __wt_compact_uri_analyze -- + * __compact_uri_analyze -- * Extract information relevant to deciding what work compact needs to * do from a URI that is part of a table schema. 
* Called via the schema_worker function. */ -int -__wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) +static int +__compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) { /* * Add references to schema URI objects to the list of objects to be @@ -120,6 +120,61 @@ __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp) } /* + * __compact_start -- + * Start object compaction. + */ +static int +__compact_start(WT_SESSION_IMPL *session) +{ + WT_BM *bm; + + bm = S2BT(session)->bm; + return (bm->compact_start(bm, session)); +} + +/* + * __compact_end -- + * End object compaction. + */ +static int +__compact_end(WT_SESSION_IMPL *session) +{ + WT_BM *bm; + + bm = S2BT(session)->bm; + return (bm->compact_end(bm, session)); +} + +/* + * __compact_handle_append -- + * Gather a file handle to be compacted. + * Called via the schema_worker function. + */ +static int +__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_DECL_RET; + + WT_UNUSED(cfg); + + /* Make sure there is space for the next entry. */ + WT_RET(__wt_realloc_def(session, &session->op_handle_allocated, + session->op_handle_next + 1, &session->op_handle)); + + WT_RET(__wt_session_get_btree( + session, session->dhandle->name, NULL, NULL, 0)); + + /* Set compact active on the handle. */ + if ((ret = __compact_start(session)) != 0) { + WT_TRET(__wt_session_release_btree(session)); + return (ret); + } + + session->op_handle[session->op_handle_next++] = session->dhandle; + return (0); +} + +/* * __session_compact_check_timeout -- * Check if the timeout has been exceeded. */ @@ -143,21 +198,25 @@ __session_compact_check_timeout( * Function to alternate between checkpoints and compaction calls. 
*/ static int -__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +__compact_file(WT_SESSION_IMPL *session, const char *cfg[]) { struct timespec start_time; + WT_DATA_HANDLE *dhandle; WT_DECL_ITEM(t); WT_DECL_RET; int i; const char *checkpoint_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_checkpoint), NULL, NULL }; + dhandle = session->dhandle; + /* * Force the checkpoint: we don't want to skip it because the work we * need to have done is done in the underlying block manager. */ WT_ERR(__wt_scr_alloc(session, 128, &t)); - WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); + WT_ERR(__wt_buf_fmt( + session, t, "target=(\"%s\"),force=1", dhandle->name)); checkpoint_cfg[1] = t->data; WT_ERR(__wt_epoch(session, &start_time)); @@ -173,9 +232,8 @@ __compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) WT_ERR(__wt_txn_checkpoint(session, checkpoint_cfg)); session->compact_state = WT_COMPACT_RUNNING; - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, uri, __wt_compact, NULL, cfg, 0)); + WT_WITH_DHANDLE(session, dhandle, + ret = __wt_compact(session, cfg)); WT_ERR(ret); if (session->compact_state != WT_COMPACT_SUCCESS) break; @@ -193,6 +251,7 @@ err: session->compact_state = WT_COMPACT_NONE; /* * __wt_session_compact -- + * WT_SESSION.compact method. */ int __wt_session_compact( @@ -203,6 +262,7 @@ __wt_session_compact( WT_DECL_RET; WT_SESSION_IMPL *session; WT_TXN *txn; + u_int i; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); @@ -227,10 +287,10 @@ __wt_session_compact( WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); session->compact->max_time = (uint64_t)cval.val; - /* Find the types of data sources are being compacted. */ + /* Find the types of data sources being compacted. 
*/ WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); + ret = __wt_schema_worker(session, uri, + __compact_handle_append, __compact_uri_analyze, cfg, 0)); WT_ERR(ret); if (session->compact->lsm_count != 0) @@ -247,11 +307,25 @@ __wt_session_compact( WT_ERR_MSG(session, EINVAL, " File compaction not permitted in a transaction"); - WT_ERR(__compact_file(session, uri, cfg)); + for (i = 0; i < session->op_handle_next; ++i) { + WT_WITH_DHANDLE(session, session->op_handle[i], + ret = __compact_file(session, cfg)); + WT_ERR(ret); + } } err: session->compact = NULL; + for (i = 0; i < session->op_handle_next; ++i) { + WT_WITH_DHANDLE(session, session->op_handle[i], + WT_TRET(__compact_end(session))); + WT_WITH_DHANDLE(session, session->op_handle[i], + WT_TRET(__wt_session_release_btree(session))); + } + + __wt_free(session, session->op_handle); + session->op_handle_allocated = session->op_handle_next = 0; + /* * Release common session resources (for example, checkpoint may acquire * significant reconciliation structures/memory). @@ -260,3 +334,17 @@ err: session->compact = NULL; API_END_RET_NOTFOUND_MAP(session, ret); } + +/* + * __wt_session_compact_readonly -- + * WT_SESSION.compact method; readonly version. + */ +int +__wt_session_compact_readonly( + WT_SESSION *wt_session, const char *uri, const char *config) +{ + WT_UNUSED(uri); + WT_UNUSED(config); + + return (__wt_session_notsup(wt_session)); +} diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index 1ee3342442c..ddf4d3dfa33 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -15,24 +15,21 @@ static int __session_dhandle_sweep(WT_SESSION_IMPL *); * Add a handle to the session's cache. 
*/ static int -__session_add_dhandle( - WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE **dhandle_cachep) +__session_add_dhandle(WT_SESSION_IMPL *session) { WT_DATA_HANDLE_CACHE *dhandle_cache; uint64_t bucket; + /* Allocate a handle cache entry. */ WT_RET(__wt_calloc_one(session, &dhandle_cache)); + dhandle_cache->dhandle = session->dhandle; bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q); TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq); - if (dhandle_cachep != NULL) - *dhandle_cachep = dhandle_cache; - - /* Sweep the handle list to remove any dead handles. */ - return (__session_dhandle_sweep(session)); + return (0); } /* @@ -450,14 +447,23 @@ __session_get_dhandle( return (0); } + /* Sweep the handle list to remove any dead handles. */ + WT_RET(__session_dhandle_sweep(session)); + /* * We didn't find a match in the session cache, search the shared * handle list and cache the handle we find. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = __session_find_shared_dhandle(session, uri, checkpoint)); - if (ret == 0) - ret = __session_add_dhandle(session, NULL); + WT_RET(ret); + + /* + * Fixup the reference count on failure (we incremented the reference + * count while holding the handle-list lock). + */ + if ((ret = __session_add_dhandle(session)) != 0) + (void)__wt_atomic_sub32(&session->dhandle->session_ref, 1); return (ret); } @@ -571,7 +577,7 @@ __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint) * files, since changes to the underlying file are visible to the in * memory pages. 
*/ - WT_ERR(__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + WT_ERR(__wt_cache_op(session, WT_SYNC_DISCARD)); /* * We lock checkpoint handles that we are overwriting, so the handle diff --git a/src/support/cksum.c b/src/support/cksum.c index c2982c40015..0b086753406 100644 --- a/src/support/cksum.c +++ b/src/support/cksum.c @@ -1260,6 +1260,23 @@ __wt_cksum_hw(const void *chunk, size_t len) } #endif +#if defined(__powerpc64__) + +unsigned int crc32_vpmsum(unsigned int crc, const unsigned char *p, + unsigned long len); + +/* + * __wt_cksum_hw -- + * Return a checksum for a chunk of memory, computed in hardware + * using 8 byte steps. + */ +static uint32_t +__wt_cksum_hw(const void *chunk, size_t len) +{ + return crc32_vpmsum(0, chunk, len); +} +#endif + /* * __wt_cksum -- * Return a checksum for a chunk of memory using the fastest method @@ -1302,6 +1319,8 @@ __wt_cksum_init(void) __wt_cksum_func = __wt_cksum_hw; else __wt_cksum_func = __wt_cksum_sw; +#elif defined(__powerpc64__) + __wt_cksum_func = __wt_cksum_hw; #else __wt_cksum_func = __wt_cksum_sw; #endif diff --git a/src/support/cond_auto.c b/src/support/cond_auto.c new file mode 100644 index 00000000000..ec95622f333 --- /dev/null +++ b/src/support/cond_auto.c @@ -0,0 +1,136 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. 
We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "wt_internal.h" + +/* + * This is an implementation of condition variables that automatically adjust + * the wait time depending on whether the wake is resulting in useful work. + */ + +/* + * __wt_cond_auto_alloc -- + * Allocate and initialize an automatically adjusting condition variable. + */ +int +__wt_cond_auto_alloc( + WT_SESSION_IMPL *session, const char *name, + bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp) +{ + WT_CONDVAR *cond; + + WT_RET(__wt_cond_alloc(session, name, is_signalled, condp)); + cond = *condp; + + cond->min_wait = min; + cond->max_wait = max; + cond->prev_wait = min; + + return (0); +} + +/* + * __wt_cond_auto_signal -- + * Signal a condition variable. + */ +int +__wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) +{ + + WT_ASSERT(session, cond->min_wait != 0); + return (__wt_cond_signal(session, cond)); +} + +/* + * __wt_cond_auto_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it before the time + * out period expires, let the caller know. + * TODO: Can this version of the API be removed, now that we have the + * auto adjusting condition variables? 
+ */ +int +__wt_cond_auto_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled) +{ + uint64_t delta; + + /* + * Catch cases where this function is called with a condition variable + * that was initialized non-auto. + */ + WT_ASSERT(session, cond->min_wait != 0); + + WT_STAT_FAST_CONN_INCR(session, cond_auto_wait); + if (progress) + cond->prev_wait = cond->min_wait; + else { + delta = WT_MAX(1, (cond->max_wait - cond->min_wait) / 10); + cond->prev_wait = WT_MIN( + cond->max_wait, cond->prev_wait + delta); + } + + WT_RET(__wt_cond_wait_signal( + session, cond, cond->prev_wait, signalled)); + + if (progress || *signalled) + WT_STAT_FAST_CONN_INCR(session, cond_auto_wait_reset); + if (*signalled) + cond->prev_wait = cond->min_wait; + + return (0); +} + +/* + * __wt_cond_auto_wait -- + * Wait on a mutex, optionally timing out. If we get it before the time + * out period expires, let the caller know. + */ +int +__wt_cond_auto_wait( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress) +{ + bool signalled; + + /* + * Call the signal version so the wait period is reset if the + * condition is woken explicitly. + */ + WT_RET(__wt_cond_auto_wait_signal(session, cond, progress, &signalled)); + + return (0); +} + +/* + * __wt_cond_auto_destroy -- + * Destroy a condition variable. 
+ */ +int +__wt_cond_auto_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp) +{ + return (__wt_cond_destroy(session, condp)); +} diff --git a/src/support/huffman.c b/src/support/huffman.c index edd0bc9f648..1e1aaeab5b5 100644 --- a/src/support/huffman.c +++ b/src/support/huffman.c @@ -492,11 +492,12 @@ __wt_huffman_open(WT_SESSION_IMPL *session, uint8_t symbol; uint32_t weighted_length; - printf("leaf depth %" PRIu16 "..%" PRIu16 ", memory use: " - "codes %u# * %uB + code2symbol %u# * %uB\n", + printf("leaf depth %" PRIu16 "..%" PRIu16 + ", memory use: codes %u# * %" WT_SIZET_FMT + "B + code2symbol %u# * %" WT_SIZET_FMT "B\n", huffman->min_depth, huffman->max_depth, - huffman->numSymbols, (u_int)sizeof(WT_HUFFMAN_CODE), - 1U << huffman->max_depth, (u_int)sizeof(uint16_t)); + huffman->numSymbols, sizeof(WT_HUFFMAN_CODE), + 1U << huffman->max_depth, sizeof(uint16_t)); /* * measure quality of computed Huffman codes, for different max bit diff --git a/src/support/power8/LICENSE.TXT b/src/support/power8/LICENSE.TXT new file mode 100644 index 00000000000..2f4bb91f574 --- /dev/null +++ b/src/support/power8/LICENSE.TXT @@ -0,0 +1,476 @@ +Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + +crc32-vpmsum is free software; you can redistribute it and/or +modify it under the terms of either: + + a) the GNU General Public License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version., or + b) the Apache License, Version 2.0 + + + + + + + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + + + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/src/support/power8/README.md b/src/support/power8/README.md new file mode 100644 index 00000000000..3e2976650cd --- /dev/null +++ b/src/support/power8/README.md @@ -0,0 +1,208 @@ +crc32-vpmsum +============ + +A set of examples for accelerating CRC32 calculations using the vector +polynomial multiply sum (vpmsum) instructions introduced in POWER8. These +instructions implement byte, halfword, word and doubleword carryless +multiply/add. + +Performance +----------- + +An implementation of slice-by-8, one of the fastest lookup table methods +is included so we can compare performance against it. Testing 5000000 +iterations of a CRC of 32 kB of data (to keep it L1 cache contained): + +``` +# time slice_by_8_bench 32768 5000000 +122.220 seconds + +# time crc32_bench 32768 5000000 +2.937 seconds +``` + +The vpmsum accelerated CRC is just over 41x faster. + +This test was run on a 4.1 GHz POWER8, so the algorithm sustains about +52 GiB/sec or 13.6 bytes/cycle. The theoretical limit is 16 bytes/cycle +since we can execute a maximum of one vpmsum instruction per cycle. + +In another test, a version was added to the kernel and btrfs write +performance was shown to be 3.8x faster. The test was done to a ramdisk +to mitigate any I/O induced variability. + +Quick start +----------- + +- Modify CRC and OPTIONS in the Makefile. There are examples for the two most + common crc32s. 
+ +- Type make to create the constants (crc32_constants.h) + +- Import the code into your application (crc32.S crc32_wrapper.c + crc32_constants.h ppc-opcode.h) and call the CRC: + +``` +unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len); +``` + +CRC background +-------------- + +For a good background on CRCs, check out: + +http://www.ross.net/crc/download/crc_v3.txt + +A few key points: + +- A CRC is the remainder after dividing a message by the CRC polynomial, + ie M mod CRC_POLY +- multiply/divide is carryless +- add/subtract is an xor +- n (where n is the order of the CRC) bits of zeroes are appended to the + end of the message. + +One more important piece of information - a CRC is a linear function, so: + +``` + CRC(A xor B) = CRC(A) xor CRC(B) + + CRC(A . B) = CRC(A) . CRC(B) (remember this is carryless multiply) +``` + +If we take 64bits of data, represented by two 32 bit chunks (AAAAAAAA +and BBBBBBBB): + +``` +CRC(AAAAAAAABBBBBBBB) + = CRC(AAAAAAAA00000000 xor BBBBBBBB) + = CRC(AAAAAAAA00000000) xor CRC(BBBBBBBB) +``` + +If we operate on AAAAAAAA: + +``` +CRC(AAAAAAAA00000000) + = CRC(AAAAAAAA . 100000000) + = CRC(AAAAAAAA) . CRC(100000000) +``` + +And CRC(100000000) is a constant which we can pre-calculate: + +``` +CRC(100000000) + = 100000000 mod CRC_POLY + = 2^32 mod CRC_POLY +``` + +Finally we can add our modified AAAAAAAA to BBBBBBBB: + +``` +CRC(AAAAAAAABBBBBBBB) + = ((2^32 mod CRC_POLY) . CRC(AAAAAAAA)) xor CRC(BBBBBBBB) +``` + +In other words, with the right constants pre-calculated we can shift the +input data around and we can also calculate the CRC in as many parallel +chunks as we want. + +No matter how much shifting we do, the final result will be 64 bits of +data (63 actually, because there is no carry into the top bit). To reduce +it further we need another trick, and that is Barrett reduction: + +http://en.wikipedia.org/wiki/Barrett_reduction + +Barrett reduction is a method of calculating a mod n. 
The idea is to +calculate q, the multiple of our polynomial that we need to subtract. By +doing the computation 2x bits higher (ie 64 bits) and shifting the +result back down 2x bits, we round down to the nearest multiple. + +``` + k = 32 + m = floor((4^k)/n) = floor((4^32)/n) + n = CRC polynomial + a = 64 bits of data + + q = floor(ma/(2^64)) + result = a - qn +``` + +An example in the floating point domain makes it clearer how this works: + +``` +a mod n = a - floor(am) * n +``` + +Let's use it to calculate 22 mod 10: + +``` + a = 22 + n = 10 + m = 1/n = 1/10 = 0.1 + +22 mod 10 + = 22 - floor(22*0.1) * 10 + = 22 - 2 * 10 + = 22 - 20 + = 2 +``` + +There is one more issue left - bit reflection. Some CRCs are defined to +operate on the least significant bit first (eg CRC32c). Let's look at +how this would get laid out in a register, and let's simplify it to just +two bytes (vs a 16 byte VMX register): + + [ 8..15 ] [ 0..7 ] + +Notice how the bits and bytes are out of order. Since we are doing +multi word multiplication on these values we need them to both be +in order. + +The simplest way to fix this is to reflect the bits in each byte: + + [ 15..8 ] [ 7..0 ] + +However shuffling bits in a byte is expensive on most CPUs. It is +however relatively cheap to shuffle bytes around. What if we load +the bytes in reverse order: + + [ 0..7 ] [ 8..15 ] + +Now the bits and bytes are in order, except the least significant bit +of the register is now on the left and the most significant bit is on the +right. We operate as if the register is reflected, which normally we +cannot do. The reason we get away with this is our multiplies are carryless +and our addition and subtraction are xor, so our operations never create +carries. + +The only trick is we have to shift the result of multiplies left one bit +because the high bit of the multiply is always 0, and we want that high bit +on the right not the left. 
+ +Implementation +-------------- + +The vpmsum instructions on POWER8 have a 6 cycle latency and we can +execute one every cycle. In light of this the main loop has 8 parallel +streams which consume 8 x 16 B each iteration. At the completion of this +loop we have taken 32 kB of data and reduced it to 8 x 16 B (128 B). + +The next step is to take this 128 B and reduce it to 8 B. At this stage +we also add 32 bits of 0 to the end. + +We then apply Barrett reduction to get our CRC. + +Examples +-------- +- barrett_reduction: An example of Barrett reduction + +- final_fold: Starting with 128 bits, add 32 bits of zeros and reduce it to + 64 bits, then apply Barrett reduction + +- final_fold2: A second method of reduction + +Acknowledgements +---------------- + +Thanks to Michael Gschwind, Jeff Derby, Lorena Pesantez and Stewart Smith +for their ideas and assistance. diff --git a/src/support/power8/crc32.S b/src/support/power8/crc32.S new file mode 100644 index 00000000000..c0b81143f07 --- /dev/null +++ b/src/support/power8/crc32.S @@ -0,0 +1,771 @@ +#if defined(__powerpc64__) +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. 
+ * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <ppc-asm.h> +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32_constants.h" + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 
+ + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. 
+ */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. 
+ */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. 
+ */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + 
VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. 
+ */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? 
*/ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor 
v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) +#endif diff --git a/src/support/power8/crc32_constants.h b/src/support/power8/crc32_constants.h new file mode 100644 index 00000000000..02c471d1c56 --- /dev/null +++ b/src/support/power8/crc32_constants.h @@ -0,0 +1,901 @@ +#define CRC 0x1edc6f41 +#define CRC_XOR +#define REFLECT + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,}; + +#endif +#else +#define MAX_SIZE 32768 +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 
+ + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + 
.octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod 
p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 
0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` 
<< 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 
0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` 
<< 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 
0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` 
<< 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod 
p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod 
p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod 
p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod 
p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 
0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div 
p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/support/power8/crc32_wrapper.c b/src/support/power8/crc32_wrapper.c new file mode 100644 index 00000000000..34ac4150338 --- /dev/null +++ b/src/support/power8/crc32_wrapper.c @@ -0,0 +1,66 @@ +#if defined(__powerpc64__) +#define CRC_TABLE +#include "crc32_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#else +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); + return crc; +} +#endif + +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len); + +unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len) +{ + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, p, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} +#endif diff --git a/src/support/power8/ppc-opcode.h b/src/support/power8/ppc-opcode.h new file mode 100644 index 00000000000..b63feea60a0 --- /dev/null +++ b/src/support/power8/ppc-opcode.h @@ -0,0 +1,23 @@ +#ifndef __OPCODES_H +#define __OPCODES_H + +#define __PPC_RA(a) (((a) & 0x1f) << 16) +#define __PPC_RB(b) (((b) & 0x1f) << 11) +#define 
__PPC_XA(a) ((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3)) +#define __PPC_XB(b) ((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4)) +#define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0) + +#endif diff --git a/src/support/scratch.c b/src/support/scratch.c index 94020ba2621..aea98dc49ef 100644 --- a/src/support/scratch.c +++ b/src/support/scratch.c @@ -45,7 +45,7 @@ __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_RET(__wt_realloc_aligned( session, &buf->memsize, size, &buf->mem)); else - WT_RET(__wt_realloc( + WT_RET(__wt_realloc_noclear( session, &buf->memsize, size, &buf->mem)); } diff --git a/src/support/stat.c b/src/support/stat.c index 7a615131628..2a826eda962 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -3,102 +3,102 @@ #include "wt_internal.h" static const char * const __stats_dsrc_desc[] = { - "block-manager: file allocation unit size", - "block-manager: blocks allocated", - "block-manager: checkpoint size", - "block-manager: allocations requiring file extension", - "block-manager: blocks freed", - "block-manager: file magic number", - "block-manager: file major version number", - "block-manager: minor version number", - "block-manager: file bytes available for reuse", - "block-manager: file size in bytes", - "LSM: bloom filters in the LSM tree", "LSM: bloom filter false positives", "LSM: bloom filter hits", "LSM: bloom filter 
misses", "LSM: bloom filter pages evicted from cache", "LSM: bloom filter pages read into cache", + "LSM: bloom filters in the LSM tree", + "LSM: chunks in the LSM tree", + "LSM: highest merge generation in the LSM tree", + "LSM: queries that could have benefited from a Bloom filter that did not exist", + "LSM: sleep for LSM checkpoint throttle", + "LSM: sleep for LSM merge throttle", "LSM: total size of bloom filters", + "block-manager: allocations requiring file extension", + "block-manager: blocks allocated", + "block-manager: blocks freed", + "block-manager: checkpoint size", + "block-manager: file allocation unit size", + "block-manager: file bytes available for reuse", + "block-manager: file magic number", + "block-manager: file major version number", + "block-manager: file size in bytes", + "block-manager: minor version number", "btree: btree checkpoint generation", - "btree: column-store variable-size deleted values", "btree: column-store fixed-size leaf pages", "btree: column-store internal pages", "btree: column-store variable-size RLE encoded values", + "btree: column-store variable-size deleted values", "btree: column-store variable-size leaf pages", - "btree: pages rewritten by compaction", - "btree: number of key/value pairs", "btree: fixed-record size", - "btree: maximum tree depth", "btree: maximum internal page key size", "btree: maximum internal page size", "btree: maximum leaf page key size", "btree: maximum leaf page size", "btree: maximum leaf page value size", + "btree: maximum tree depth", + "btree: number of key/value pairs", "btree: overflow pages", + "btree: pages rewritten by compaction", "btree: row-store internal pages", "btree: row-store leaf pages", "cache: bytes read into cache", "cache: bytes written from cache", "cache: checkpoint blocked page eviction", - "cache: unmodified pages evicted", - "cache: page split during eviction deepened the tree", - "cache: modified pages evicted", "cache: data source pages selected for eviction 
unable to be evicted", "cache: hazard pointer blocked page eviction", + "cache: in-memory page passed criteria to be split", + "cache: in-memory page splits", "cache: internal pages evicted", "cache: internal pages split during eviction", "cache: leaf pages split during eviction", - "cache: in-memory page splits", - "cache: in-memory page passed criteria to be split", + "cache: modified pages evicted", + "cache: overflow pages read into cache", "cache: overflow values cached in memory", + "cache: page split during eviction deepened the tree", + "cache: page written requiring lookaside records", "cache: pages read into cache", "cache: pages read into cache requiring lookaside entries", - "cache: overflow pages read into cache", "cache: pages written from cache", - "cache: page written requiring lookaside records", "cache: pages written requiring in-memory restoration", - "compression: raw compression call failed, no additional data available", - "compression: raw compression call failed, additional data available", - "compression: raw compression call succeeded", + "cache: unmodified pages evicted", "compression: compressed pages read", "compression: compressed pages written", "compression: page written failed to compress", "compression: page written was too small to compress", - "cursor: create calls", - "cursor: insert calls", + "compression: raw compression call failed, additional data available", + "compression: raw compression call failed, no additional data available", + "compression: raw compression call succeeded", "cursor: bulk-loaded cursor-insert calls", + "cursor: create calls", "cursor: cursor-insert key and value bytes inserted", + "cursor: cursor-remove key bytes removed", + "cursor: cursor-update value bytes updated", + "cursor: insert calls", "cursor: next calls", "cursor: prev calls", "cursor: remove calls", - "cursor: cursor-remove key bytes removed", "cursor: reset calls", "cursor: restarted searches", "cursor: search calls", "cursor: search near 
calls", "cursor: truncate calls", "cursor: update calls", - "cursor: cursor-update value bytes updated", - "LSM: sleep for LSM checkpoint throttle", - "LSM: chunks in the LSM tree", - "LSM: highest merge generation in the LSM tree", - "LSM: queries that could have benefited from a Bloom filter that did not exist", - "LSM: sleep for LSM merge throttle", "reconciliation: dictionary matches", + "reconciliation: fast-path pages deleted", + "reconciliation: internal page key bytes discarded using suffix compression", "reconciliation: internal page multi-block writes", - "reconciliation: leaf page multi-block writes", - "reconciliation: maximum blocks required for a page", "reconciliation: internal-page overflow keys", + "reconciliation: leaf page key bytes discarded using prefix compression", + "reconciliation: leaf page multi-block writes", "reconciliation: leaf-page overflow keys", + "reconciliation: maximum blocks required for a page", "reconciliation: overflow values written", - "reconciliation: pages deleted", - "reconciliation: fast-path pages deleted", "reconciliation: page checksum matches", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", - "reconciliation: leaf page key bytes discarded using prefix compression", - "reconciliation: internal page key bytes discarded using suffix compression", + "reconciliation: pages deleted", "session: object compaction", "session: open cursor count", "transaction: update conflicts", @@ -132,6 +132,18 @@ __wt_stat_dsrc_init(WT_DATA_HANDLE *handle) void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) { + stats->bloom_false_positive = 0; + stats->bloom_hit = 0; + stats->bloom_miss = 0; + stats->bloom_page_evict = 0; + stats->bloom_page_read = 0; + stats->bloom_count = 0; + stats->lsm_chunk_count = 0; + stats->lsm_generation_max = 0; + stats->lsm_lookup_no_bloom = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + stats->bloom_size = 0; 
stats->block_extension = 0; stats->block_alloc = 0; stats->block_free = 0; @@ -145,9 +157,9 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) /* not clearing btree_checkpoint_generation */ stats->btree_column_fix = 0; stats->btree_column_internal = 0; + stats->btree_column_rle = 0; stats->btree_column_deleted = 0; stats->btree_column_variable = 0; - stats->btree_column_rle = 0; stats->btree_fixed_len = 0; stats->btree_maxintlkey = 0; stats->btree_maxintlpage = 0; @@ -202,18 +214,6 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) stats->cursor_search_near = 0; stats->cursor_truncate = 0; stats->cursor_update = 0; - stats->bloom_false_positive = 0; - stats->bloom_hit = 0; - stats->bloom_miss = 0; - stats->bloom_page_evict = 0; - stats->bloom_page_read = 0; - stats->bloom_count = 0; - stats->lsm_chunk_count = 0; - stats->lsm_generation_max = 0; - stats->lsm_lookup_no_bloom = 0; - stats->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = 0; - stats->bloom_size = 0; stats->rec_dictionary = 0; stats->rec_page_delete_fast = 0; stats->rec_suffix_compression = 0; @@ -246,6 +246,19 @@ void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to) { + to->bloom_false_positive += from->bloom_false_positive; + to->bloom_hit += from->bloom_hit; + to->bloom_miss += from->bloom_miss; + to->bloom_page_evict += from->bloom_page_evict; + to->bloom_page_read += from->bloom_page_read; + to->bloom_count += from->bloom_count; + to->lsm_chunk_count += from->lsm_chunk_count; + if (from->lsm_generation_max > to->lsm_generation_max) + to->lsm_generation_max = from->lsm_generation_max; + to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom; + to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle; + to->lsm_merge_throttle += from->lsm_merge_throttle; + to->bloom_size += from->bloom_size; to->block_extension += from->block_extension; to->block_alloc += from->block_alloc; to->block_free += from->block_free; @@ -263,9 +276,9 @@ __wt_stat_dsrc_aggregate_single( 
to->btree_checkpoint_generation += from->btree_checkpoint_generation; to->btree_column_fix += from->btree_column_fix; to->btree_column_internal += from->btree_column_internal; + to->btree_column_rle += from->btree_column_rle; to->btree_column_deleted += from->btree_column_deleted; to->btree_column_variable += from->btree_column_variable; - to->btree_column_rle += from->btree_column_rle; if (from->btree_fixed_len > to->btree_fixed_len) to->btree_fixed_len = from->btree_fixed_len; if (from->btree_maxintlkey > to->btree_maxintlkey) @@ -328,19 +341,6 @@ __wt_stat_dsrc_aggregate_single( to->cursor_search_near += from->cursor_search_near; to->cursor_truncate += from->cursor_truncate; to->cursor_update += from->cursor_update; - to->bloom_false_positive += from->bloom_false_positive; - to->bloom_hit += from->bloom_hit; - to->bloom_miss += from->bloom_miss; - to->bloom_page_evict += from->bloom_page_evict; - to->bloom_page_read += from->bloom_page_read; - to->bloom_count += from->bloom_count; - to->lsm_chunk_count += from->lsm_chunk_count; - if (from->lsm_generation_max > to->lsm_generation_max) - to->lsm_generation_max = from->lsm_generation_max; - to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom; - to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle; - to->lsm_merge_throttle += from->lsm_merge_throttle; - to->bloom_size += from->bloom_size; to->rec_dictionary += from->rec_dictionary; to->rec_page_delete_fast += from->rec_page_delete_fast; to->rec_suffix_compression += from->rec_suffix_compression; @@ -367,6 +367,21 @@ __wt_stat_dsrc_aggregate( { int64_t v; + to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); + to->bloom_hit += WT_STAT_READ(from, bloom_hit); + to->bloom_miss += WT_STAT_READ(from, bloom_miss); + to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict); + to->bloom_page_read += WT_STAT_READ(from, bloom_page_read); + to->bloom_count += WT_STAT_READ(from, bloom_count); + to->lsm_chunk_count += WT_STAT_READ(from, 
lsm_chunk_count); + if ((v = WT_STAT_READ(from, lsm_generation_max)) > + to->lsm_generation_max) + to->lsm_generation_max = v; + to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->bloom_size += WT_STAT_READ(from, bloom_size); to->block_extension += WT_STAT_READ(from, block_extension); to->block_alloc += WT_STAT_READ(from, block_alloc); to->block_free += WT_STAT_READ(from, block_free); @@ -387,10 +402,10 @@ __wt_stat_dsrc_aggregate( to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); to->btree_column_internal += WT_STAT_READ(from, btree_column_internal); + to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted); to->btree_column_variable += WT_STAT_READ(from, btree_column_variable); - to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); if ((v = WT_STAT_READ(from, btree_fixed_len)) > to->btree_fixed_len) to->btree_fixed_len = v; if ((v = WT_STAT_READ(from, btree_maxintlkey)) > to->btree_maxintlkey) @@ -467,21 +482,6 @@ __wt_stat_dsrc_aggregate( to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); to->cursor_truncate += WT_STAT_READ(from, cursor_truncate); to->cursor_update += WT_STAT_READ(from, cursor_update); - to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); - to->bloom_hit += WT_STAT_READ(from, bloom_hit); - to->bloom_miss += WT_STAT_READ(from, bloom_miss); - to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict); - to->bloom_page_read += WT_STAT_READ(from, bloom_page_read); - to->bloom_count += WT_STAT_READ(from, bloom_count); - to->lsm_chunk_count += WT_STAT_READ(from, lsm_chunk_count); - if ((v = WT_STAT_READ(from, lsm_generation_max)) > - to->lsm_generation_max) - to->lsm_generation_max = v; - to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom); 
- to->lsm_checkpoint_throttle += - WT_STAT_READ(from, lsm_checkpoint_throttle); - to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); - to->bloom_size += WT_STAT_READ(from, bloom_size); to->rec_dictionary += WT_STAT_READ(from, rec_dictionary); to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_suffix_compression += @@ -509,12 +509,22 @@ __wt_stat_dsrc_aggregate( } static const char * const __stats_connection_desc[] = { - "async: number of allocation state races", - "async: number of operation slots viewed for allocation", + "LSM: application work units currently queued", + "LSM: merge work units currently queued", + "LSM: rows merged in an LSM tree", + "LSM: sleep for LSM checkpoint throttle", + "LSM: sleep for LSM merge throttle", + "LSM: switch work units currently queued", + "LSM: tree maintenance operations discarded", + "LSM: tree maintenance operations executed", + "LSM: tree maintenance operations scheduled", + "LSM: tree queue hit maximum", "async: current work queue length", + "async: maximum work queue length", + "async: number of allocation state races", "async: number of flush calls", + "async: number of operation slots viewed for allocation", "async: number of times operation allocation failed", - "async: maximum work queue length", "async: number of times worker found no work", "async: total allocations", "async: total compact calls", @@ -522,55 +532,66 @@ static const char * const __stats_connection_desc[] = { "async: total remove calls", "async: total search calls", "async: total update calls", - "block-manager: mapped bytes read", - "block-manager: bytes read", - "block-manager: bytes written", - "block-manager: mapped blocks read", "block-manager: blocks pre-loaded", "block-manager: blocks read", "block-manager: blocks written", - "cache: tracked dirty bytes in the cache", - "cache: tracked bytes belonging to internal pages in the cache", + "block-manager: bytes read", + "block-manager: bytes written", + 
"block-manager: mapped blocks read", + "block-manager: mapped bytes read", "cache: bytes currently in the cache", - "cache: tracked bytes belonging to leaf pages in the cache", - "cache: maximum bytes configured", - "cache: tracked bytes belonging to overflow pages in the cache", "cache: bytes read into cache", "cache: bytes written from cache", - "cache: pages evicted by application threads", "cache: checkpoint blocked page eviction", - "cache: unmodified pages evicted", - "cache: page split during eviction deepened the tree", - "cache: modified pages evicted", - "cache: pages selected for eviction unable to be evicted", - "cache: pages evicted because they exceeded the in-memory maximum", - "cache: pages evicted because they had chains of deleted items", - "cache: failed eviction of pages that exceeded the in-memory maximum", - "cache: hazard pointer blocked page eviction", - "cache: internal pages evicted", - "cache: maximum page size at eviction", + "cache: eviction currently operating in aggressive mode", "cache: eviction server candidate queue empty when topping up", "cache: eviction server candidate queue not empty when topping up", "cache: eviction server evicting pages", "cache: eviction server populating queue, but not evicting pages", "cache: eviction server unable to reach eviction goal", - "cache: internal pages split during eviction", - "cache: leaf pages split during eviction", - "cache: pages walked for eviction", "cache: eviction worker thread evicting pages", - "cache: in-memory page splits", + "cache: failed eviction of pages that exceeded the in-memory maximum", + "cache: hazard pointer blocked page eviction", "cache: in-memory page passed criteria to be split", + "cache: in-memory page splits", + "cache: internal pages evicted", + "cache: internal pages split during eviction", + "cache: leaf pages split during eviction", "cache: lookaside table insert calls", "cache: lookaside table remove calls", - "cache: percentage overhead", - "cache: 
tracked dirty pages in the cache", + "cache: maximum bytes configured", + "cache: maximum page size at eviction", + "cache: modified pages evicted", + "cache: page split during eviction deepened the tree", + "cache: page written requiring lookaside records", "cache: pages currently held in the cache", + "cache: pages evicted because they exceeded the in-memory maximum", + "cache: pages evicted because they had chains of deleted items", + "cache: pages evicted by application threads", "cache: pages read into cache", "cache: pages read into cache requiring lookaside entries", + "cache: pages selected for eviction unable to be evicted", + "cache: pages walked for eviction", "cache: pages written from cache", - "cache: page written requiring lookaside records", "cache: pages written requiring in-memory restoration", + "cache: percentage overhead", + "cache: tracked bytes belonging to internal pages in the cache", + "cache: tracked bytes belonging to leaf pages in the cache", + "cache: tracked bytes belonging to overflow pages in the cache", + "cache: tracked dirty bytes in the cache", + "cache: tracked dirty pages in the cache", + "cache: unmodified pages evicted", + "connection: auto adjusting condition resets", + "connection: auto adjusting condition wait calls", + "connection: files currently open", + "connection: memory allocations", + "connection: memory frees", + "connection: memory re-allocations", "connection: pthread mutex condition wait calls", + "connection: pthread mutex shared lock read-lock calls", + "connection: pthread mutex shared lock write-lock calls", + "connection: total read I/Os", + "connection: total write I/Os", "cursor: cursor create calls", "cursor: cursor insert calls", "cursor: cursor next calls", @@ -580,96 +601,81 @@ static const char * const __stats_connection_desc[] = { "cursor: cursor restarted searches", "cursor: cursor search calls", "cursor: cursor search near calls", - "cursor: truncate calls", "cursor: cursor update calls", + 
"cursor: truncate calls", "data-handle: connection data handles currently active", - "data-handle: session dhandles swept", - "data-handle: session sweep attempts", - "data-handle: connection sweep dhandles closed", "data-handle: connection sweep candidate became referenced", + "data-handle: connection sweep dhandles closed", "data-handle: connection sweep dhandles removed from hash list", "data-handle: connection sweep time-of-death sets", "data-handle: connection sweeps", - "connection: files currently open", - "log: total log buffer size", + "data-handle: session dhandles swept", + "data-handle: session sweep attempts", + "log: busy returns attempting to switch slots", + "log: consolidated slot closures", + "log: consolidated slot join races", + "log: consolidated slot join transitions", + "log: consolidated slot joins", + "log: consolidated slot unbuffered writes", "log: log bytes of payload data", "log: log bytes written", - "log: yields waiting for previous log file close", - "log: total size of compressed records", - "log: total in-memory size of compressed records", - "log: log records too small to compress", - "log: log records not compressed", - "log: log records compressed", + "log: log files manually zero-filled", "log: log flush operations", + "log: log force write operations", + "log: log force write operations skipped", + "log: log records compressed", + "log: log records not compressed", + "log: log records too small to compress", + "log: log release advances write LSN", + "log: log scan operations", + "log: log scan records requiring two reads", + "log: log server thread advances write LSN", + "log: log server thread write LSN walk skipped", + "log: log sync operations", + "log: log sync_dir operations", + "log: log write operations", + "log: logging bytes consolidated", "log: maximum log file size", - "log: pre-allocated log files prepared", "log: number of pre-allocated log files to create", "log: pre-allocated log files not ready and missed", + 
"log: pre-allocated log files prepared", "log: pre-allocated log files used", - "log: log release advances write LSN", "log: records processed by log scan", - "log: log scan records requiring two reads", - "log: log scan operations", - "log: consolidated slot closures", + "log: total in-memory size of compressed records", + "log: total log buffer size", + "log: total size of compressed records", "log: written slots coalesced", - "log: logging bytes consolidated", - "log: consolidated slot joins", - "log: consolidated slot join races", - "log: busy returns attempting to switch slots", - "log: consolidated slot join transitions", - "log: consolidated slot unbuffered writes", - "log: log sync operations", - "log: log sync_dir operations", - "log: log server thread advances write LSN", - "log: log write operations", - "log: log files manually zero-filled", - "LSM: sleep for LSM checkpoint throttle", - "LSM: sleep for LSM merge throttle", - "LSM: rows merged in an LSM tree", - "LSM: application work units currently queued", - "LSM: merge work units currently queued", - "LSM: tree queue hit maximum", - "LSM: switch work units currently queued", - "LSM: tree maintenance operations scheduled", - "LSM: tree maintenance operations discarded", - "LSM: tree maintenance operations executed", - "connection: memory allocations", - "connection: memory frees", - "connection: memory re-allocations", - "thread-yield: page acquire busy blocked", - "thread-yield: page acquire eviction blocked", - "thread-yield: page acquire locked blocked", - "thread-yield: page acquire read blocked", - "thread-yield: page acquire time sleeping (usecs)", - "connection: total read I/Os", - "reconciliation: pages deleted", + "log: yields waiting for previous log file close", "reconciliation: fast-path pages deleted", "reconciliation: page reconciliation calls", "reconciliation: page reconciliation calls for eviction", + "reconciliation: pages deleted", "reconciliation: split bytes currently awaiting 
free", "reconciliation: split objects currently awaiting free", - "connection: pthread mutex shared lock read-lock calls", - "connection: pthread mutex shared lock write-lock calls", "session: open cursor count", "session: open session count", + "thread-yield: page acquire busy blocked", + "thread-yield: page acquire eviction blocked", + "thread-yield: page acquire locked blocked", + "thread-yield: page acquire read blocked", + "thread-yield: page acquire time sleeping (usecs)", + "transaction: number of named snapshots created", + "transaction: number of named snapshots dropped", "transaction: transaction begins", - "transaction: transaction checkpoints", - "transaction: transaction checkpoint generation", "transaction: transaction checkpoint currently running", + "transaction: transaction checkpoint generation", "transaction: transaction checkpoint max time (msecs)", "transaction: transaction checkpoint min time (msecs)", "transaction: transaction checkpoint most recent time (msecs)", "transaction: transaction checkpoint total time (msecs)", - "transaction: transactions committed", + "transaction: transaction checkpoints", "transaction: transaction failures due to cache overflow", - "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned", + "transaction: transaction range of IDs currently pinned by a checkpoint", "transaction: transaction range of IDs currently pinned by named snapshots", - "transaction: transactions rolled back", - "transaction: number of named snapshots created", - "transaction: number of named snapshots dropped", "transaction: transaction sync calls", - "connection: total write I/Os", + "transaction: transactions committed", + "transaction: transactions rolled back", }; int @@ -700,6 +706,16 @@ __wt_stat_connection_init(WT_CONNECTION_IMPL *handle) void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) { + /* not clearing lsm_work_queue_app */ + /* not 
clearing lsm_work_queue_manager */ + stats->lsm_rows_merged = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + /* not clearing lsm_work_queue_switch */ + stats->lsm_work_units_discarded = 0; + stats->lsm_work_units_done = 0; + stats->lsm_work_units_created = 0; + stats->lsm_work_queue_max = 0; stats->async_cur_queue = 0; /* not clearing async_max_queue */ stats->async_alloc_race = 0; @@ -724,6 +740,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->cache_bytes_read = 0; stats->cache_bytes_write = 0; stats->cache_eviction_checkpoint = 0; + /* not clearing cache_eviction_aggressive_set */ stats->cache_eviction_queue_empty = 0; stats->cache_eviction_queue_not_empty = 0; stats->cache_eviction_server_evicting = 0; @@ -761,6 +778,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing cache_bytes_dirty */ /* not clearing cache_pages_dirty */ stats->cache_eviction_clean = 0; + stats->cond_auto_wait_reset = 0; + stats->cond_auto_wait = 0; /* not clearing file_open */ stats->memory_allocation = 0; stats->memory_free = 0; @@ -799,6 +818,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_bytes_written = 0; stats->log_zero_fills = 0; stats->log_flush = 0; + stats->log_force_write = 0; + stats->log_force_write_skip = 0; stats->log_compress_writes = 0; stats->log_compress_write_fails = 0; stats->log_compress_small = 0; @@ -806,6 +827,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_scans = 0; stats->log_scan_rereads = 0; stats->log_write_lsn = 0; + stats->log_write_lsn_skip = 0; stats->log_sync = 0; stats->log_sync_dir = 0; stats->log_writes = 0; @@ -821,16 +843,6 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_compress_len = 0; stats->log_slot_coalesced = 0; stats->log_close_yields = 0; - /* not clearing lsm_work_queue_app */ - /* not clearing lsm_work_queue_manager */ - stats->lsm_rows_merged = 0; - 
stats->lsm_checkpoint_throttle = 0; - stats->lsm_merge_throttle = 0; - /* not clearing lsm_work_queue_switch */ - stats->lsm_work_units_discarded = 0; - stats->lsm_work_units_done = 0; - stats->lsm_work_units_created = 0; - stats->lsm_work_queue_max = 0; stats->rec_page_delete_fast = 0; stats->rec_pages = 0; stats->rec_pages_eviction = 0; @@ -876,6 +888,21 @@ void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to) { + to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app); + to->lsm_work_queue_manager += + WT_STAT_READ(from, lsm_work_queue_manager); + to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->lsm_work_queue_switch += + WT_STAT_READ(from, lsm_work_queue_switch); + to->lsm_work_units_discarded += + WT_STAT_READ(from, lsm_work_units_discarded); + to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done); + to->lsm_work_units_created += + WT_STAT_READ(from, lsm_work_units_created); + to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); to->async_cur_queue += WT_STAT_READ(from, async_cur_queue); to->async_max_queue += WT_STAT_READ(from, async_max_queue); to->async_alloc_race += WT_STAT_READ(from, async_alloc_race); @@ -901,6 +928,8 @@ __wt_stat_connection_aggregate( to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write); to->cache_eviction_checkpoint += WT_STAT_READ(from, cache_eviction_checkpoint); + to->cache_eviction_aggressive_set += + WT_STAT_READ(from, cache_eviction_aggressive_set); to->cache_eviction_queue_empty += WT_STAT_READ(from, cache_eviction_queue_empty); to->cache_eviction_queue_not_empty += @@ -955,6 +984,8 @@ __wt_stat_connection_aggregate( to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty); to->cache_eviction_clean += 
WT_STAT_READ(from, cache_eviction_clean); + to->cond_auto_wait_reset += WT_STAT_READ(from, cond_auto_wait_reset); + to->cond_auto_wait += WT_STAT_READ(from, cond_auto_wait); to->file_open += WT_STAT_READ(from, file_open); to->memory_allocation += WT_STAT_READ(from, memory_allocation); to->memory_free += WT_STAT_READ(from, memory_free); @@ -993,6 +1024,8 @@ __wt_stat_connection_aggregate( to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); to->log_zero_fills += WT_STAT_READ(from, log_zero_fills); to->log_flush += WT_STAT_READ(from, log_flush); + to->log_force_write += WT_STAT_READ(from, log_force_write); + to->log_force_write_skip += WT_STAT_READ(from, log_force_write_skip); to->log_compress_writes += WT_STAT_READ(from, log_compress_writes); to->log_compress_write_fails += WT_STAT_READ(from, log_compress_write_fails); @@ -1002,6 +1035,7 @@ __wt_stat_connection_aggregate( to->log_scans += WT_STAT_READ(from, log_scans); to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads); to->log_write_lsn += WT_STAT_READ(from, log_write_lsn); + to->log_write_lsn_skip += WT_STAT_READ(from, log_write_lsn_skip); to->log_sync += WT_STAT_READ(from, log_sync); to->log_sync_dir += WT_STAT_READ(from, log_sync_dir); to->log_writes += WT_STAT_READ(from, log_writes); @@ -1018,21 +1052,6 @@ __wt_stat_connection_aggregate( to->log_compress_len += WT_STAT_READ(from, log_compress_len); to->log_slot_coalesced += WT_STAT_READ(from, log_slot_coalesced); to->log_close_yields += WT_STAT_READ(from, log_close_yields); - to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app); - to->lsm_work_queue_manager += - WT_STAT_READ(from, lsm_work_queue_manager); - to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged); - to->lsm_checkpoint_throttle += - WT_STAT_READ(from, lsm_checkpoint_throttle); - to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); - to->lsm_work_queue_switch += - WT_STAT_READ(from, lsm_work_queue_switch); - to->lsm_work_units_discarded += - 
WT_STAT_READ(from, lsm_work_units_discarded); - to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done); - to->lsm_work_units_created += - WT_STAT_READ(from, lsm_work_units_created); - to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); to->rec_page_delete_fast += WT_STAT_READ(from, rec_page_delete_fast); to->rec_pages += WT_STAT_READ(from, rec_pages); to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); diff --git a/src/txn/txn.c b/src/txn/txn.c index e8fd8c0c119..7a768a8fe20 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -344,7 +344,7 @@ retry: current_id - oldest_id > 10000 && oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 - " pinned in session %d [%s]" + " pinned in session %" PRIu32 " [%s]" " with snap_min %" PRIu64 "\n", oldest_id, oldest_session->id, oldest_session->lastop, diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 6a2c1eef826..1eebc9e9d04 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -8,6 +8,10 @@ #include "wt_internal.h" +static int __checkpoint_lock_tree( + WT_SESSION_IMPL *, bool, bool, const char *[]); +static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); + /* * __wt_checkpoint_name_ok -- * Complain if the checkpoint name isn't acceptable. @@ -155,8 +159,8 @@ __checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], ckpt_closed = cval.len != 0; } WT_ERR(ckpt_closed ? - __wt_meta_btree_apply(session, op, cfg) : - __wt_conn_btree_apply(session, false, NULL, op, cfg)); + __wt_meta_apply_all(session, op, NULL, cfg) : + __wt_conn_btree_apply(session, NULL, op, NULL, cfg)); } if (fullp != NULL) @@ -179,14 +183,8 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], /* If we have already locked the handles, apply the operation. 
*/ for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle != NULL) - WT_WITH_DHANDLE(session, - session->ckpt_handle[i].dhandle, - ret = (*op)(session, cfg)); - else - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single(session, - session->ckpt_handle[i].name, NULL, op, cfg)); + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + ret = (*op)(session, cfg)); WT_RET(ret); } @@ -230,11 +228,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) } /* - * __wt_checkpoint_list -- + * __wt_checkpoint_get_handles -- * Get a list of handles to flush. */ int -__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; const char *name; @@ -257,15 +255,18 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) name = session->dhandle->name; session->dhandle = NULL; - /* Record busy file names, we'll deal with them in the checkpoint. */ - if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) == 0) - session->ckpt_handle[session->ckpt_handle_next++].dhandle = - session->dhandle; - else if (ret == EBUSY) - ret = __wt_strdup(session, name, - &session->ckpt_handle[session->ckpt_handle_next++].name); + if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 
0 : ret); - return (ret); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, true, true, cfg)); + if (ret != 0) { + WT_TRET(__wt_session_release_btree(session)); + return (ret); + } + + session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; + return (0); } /* @@ -277,7 +278,7 @@ __checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[]) { WT_UNUSED(cfg); - return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES)); + return (__wt_cache_op(session, WT_SYNC_WRITE_LEAVES)); } /* @@ -381,15 +382,20 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Configure logging only if doing a full checkpoint. */ logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); + /* Keep track of handles acquired for locking. */ + WT_ERR(__wt_meta_track_on(session)); + tracking = true; + /* * Get a list of handles we want to flush; this may pull closed objects * into the session cache, but we're going to do that eventually anyway. */ + WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, ret, WT_WITH_TABLE_LOCK(session, ret, WT_WITH_HANDLE_LIST_LOCK(session, ret = __checkpoint_apply_all( - session, cfg, __wt_checkpoint_list, NULL)))); + session, cfg, __wt_checkpoint_get_handles, NULL)))); WT_ERR(ret); /* @@ -418,15 +424,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * completion. Do it after flushing the pages to give the * asynchronous flush as much time as possible before we wait. */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - - /* Acquire the schema lock. */ - F_SET(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_lock(session, &conn->schema_lock); - - WT_ERR(__wt_meta_track_on(session)); - tracking = true; + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); /* Tell logging that we are about to start a database checkpoint. 
*/ if (full && logging) @@ -440,6 +438,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_epoch(session, &start)); /* + * Start the checkpoint for real. + * * Bump the global checkpoint generation, used to figure out whether * checkpoint has visited a tree. There is no need for this to be * atomic: it is only written while holding the checkpoint lock. @@ -503,7 +503,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); + WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); /* * Clear the dhandle so the visibility check doesn't get confused about @@ -522,8 +522,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) - WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); + WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); WT_ERR(__checkpoint_verbose_track(session, "sync completed", &verb_timer)); @@ -543,16 +542,25 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Recovery relies on the checkpoint LSN in the metadata only being * updated by full checkpoints so only checkpoint the metadata for * full or non-logged checkpoints. + * + * This is very similar to __wt_meta_track_off, ideally they would be + * merged. */ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; /* Disable metadata tracking during the metadata checkpoint. 
*/ saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(session, cfg))); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(session, cfg)); - session->meta_track_next = saved_meta_next; + ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); WT_ERR(__checkpoint_verbose_track(session, @@ -610,23 +618,13 @@ err: /* WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); } - for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle == NULL) { - __wt_free(session, session->ckpt_handle[i].name); - continue; - } - WT_WITH_DHANDLE(session, session->ckpt_handle[i].dhandle, + for (i = 0; i < session->ckpt_handle_next; ++i) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); - } __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; - if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { - F_CLR(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_unlock(session, &conn->schema_lock); - } - session->isolation = txn->isolation = saved_isolation; return (ret); } @@ -768,14 +766,13 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) } /* - * __checkpoint_worker -- - * Checkpoint a tree. + * __checkpoint_lock_tree -- + * Acquire the locks required to checkpoint a tree. 
*/ static int -__checkpoint_worker(WT_SESSION_IMPL *session, - const char *cfg[], bool is_checkpoint, bool need_tracking) +__checkpoint_lock_tree(WT_SESSION_IMPL *session, + bool is_checkpoint, bool need_tracking, const char *cfg[]) { - WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONFIG dropconf; @@ -783,19 +780,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session, WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - WT_LSN ckptlsn; - int deleted, was_modified; - bool fake_ckpt, force, hot_backup_locked; - const char *name; char *name_alloc; + const char *name; + bool hot_backup_locked; btree = S2BT(session); - bm = btree->bm; conn = S2C(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; - was_modified = btree->modified; - fake_ckpt = hot_backup_locked = false; + hot_backup_locked = false; name_alloc = NULL; /* @@ -814,15 +807,6 @@ __checkpoint_worker(WT_SESSION_IMPL *session, WT_ASSERT(session, !need_tracking || WT_IS_METADATA(session, dhandle) || WT_META_TRACKING(session)); - /* - * Set the checkpoint LSN to the maximum LSN so that if logging is - * disabled, recovery will never roll old changes forward over the - * non-logged changes in this checkpoint. If logging is enabled, a - * real checkpoint LSN will be assigned later for this checkpoint and - * overwrite this. - */ - WT_MAX_LSN(&ckptlsn); - /* Get the list of checkpoints for this file. */ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); @@ -873,74 +857,15 @@ __checkpoint_worker(WT_SESSION_IMPL *session, /* Drop checkpoints with the same name as the one we're taking. */ __drop(ckptbase, name, strlen(name)); - /* - * Check for clean objects not requiring a checkpoint. - * - * If we're closing a handle, and the object is clean, we can skip the - * checkpoint, whatever checkpoints we have are sufficient. 
(We might - * not have any checkpoints if the object was never modified, and that's - * OK: the object creation code doesn't mark the tree modified so we can - * skip newly created trees here.) - * - * If the application repeatedly checkpoints an object (imagine hourly - * checkpoints using the same explicit or internal name), there's no - * reason to repeat the checkpoint for clean objects. The test is if - * the only checkpoint we're deleting is the last one in the list and - * it has the same name as the checkpoint we're about to take, skip the - * work. (We can't skip checkpoints that delete more than the last - * checkpoint because deleting those checkpoints might free up space in - * the file.) This means an application toggling between two (or more) - * checkpoint names will repeatedly take empty checkpoints, but that's - * not likely enough to make detection worthwhile. - * - * Checkpoint read-only objects otherwise: the application must be able - * to open the checkpoint in a cursor after taking any checkpoint, which - * means it must exist. - */ - force = false; - F_CLR(btree, WT_BTREE_SKIP_CKPT); - if (!btree->modified && cfg != NULL) { - ret = __wt_config_gets(session, cfg, "force", &cval); - if (ret != 0 && ret != WT_NOTFOUND) - WT_ERR(ret); - if (ret == 0 && cval.val != 0) - force = true; - } - if (!btree->modified && !force) { - if (!is_checkpoint) - goto nockpt; - - deleted = 0; - WT_CKPT_FOREACH(ckptbase, ckpt) - if (F_ISSET(ckpt, WT_CKPT_DELETE)) - ++deleted; - /* - * Complicated test: if the last checkpoint in the object has - * the same name as the checkpoint we're taking (correcting for - * internal checkpoint names with their generational suffix - * numbers), we can skip the checkpoint, there's nothing to do. - * The exception is if we're deleting two or more checkpoints: - * then we may save space. 
- */ - if (ckpt > ckptbase && - (strcmp(name, (ckpt - 1)->name) == 0 || - (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && - WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && - deleted < 2) { -nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); - WT_PUBLISH(btree->checkpoint_gen, - S2C(session)->txn_global.checkpoint_gen); - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, - btree->checkpoint_gen); - goto done; - } - } - /* Add a new checkpoint entry at the end of the list. */ WT_CKPT_FOREACH(ckptbase, ckpt) ; WT_ERR(__wt_strdup(session, name, &ckpt->name)); + /* + * We are now done with the local use of the name. Free the local + * allocation, if needed. + */ + __wt_free(session, name_alloc); F_SET(ckpt, WT_CKPT_ADD); /* @@ -1021,32 +946,128 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); * copy instead of forcing checkpoints on clean objects to associate * names with checkpoints. */ - if (is_checkpoint) - switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) { - case 0: - break; - case WT_BTREE_BULK: - /* - * The only checkpoints a bulk-loaded file should have - * are fake ones we created without the underlying block - * manager. I'm leaving this code here because it's a - * cheap test and a nasty race. 
- */ - WT_CKPT_FOREACH(ckptbase, ckpt) - if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE)) - WT_ERR_MSG(session, ret, - "block-manager checkpoint found " - "for a bulk-loaded file"); - fake_ckpt = true; - goto fake; - case WT_BTREE_REBALANCE: - case WT_BTREE_SALVAGE: - case WT_BTREE_UPGRADE: - case WT_BTREE_VERIFY: - WT_ERR_MSG(session, EINVAL, - "checkpoints are blocked during rebalance, " - "salvage, upgrade or verify operations"); + WT_ASSERT(session, + !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); + + hot_backup_locked = false; + WT_ERR(__wt_readunlock(session, conn->hot_backup_lock)); + + WT_ASSERT(session, btree->ckpt == NULL); + btree->ckpt = ckptbase; + + return (0); + +err: if (hot_backup_locked) + WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); + + __wt_meta_ckptlist_free(session, ckptbase); + __wt_free(session, name_alloc); + + return (ret); +} + +/* + * __checkpoint_tree -- + * Checkpoint a single tree. + * Assumes all necessary locks have been acquired by the caller. + */ +static int +__checkpoint_tree( + WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[]) +{ + WT_BM *bm; + WT_BTREE *btree; + WT_CKPT *ckpt, *ckptbase; + WT_CONFIG_ITEM cval; + WT_CONNECTION_IMPL *conn; + WT_DATA_HANDLE *dhandle; + WT_DECL_RET; + WT_LSN ckptlsn; + const char *name; + int deleted, was_modified; + bool fake_ckpt, force; + + btree = S2BT(session); + bm = btree->bm; + ckptbase = btree->ckpt; + conn = S2C(session); + dhandle = session->dhandle; + fake_ckpt = false; + was_modified = btree->modified; + + /* + * Set the checkpoint LSN to the maximum LSN so that if logging is + * disabled, recovery will never roll old changes forward over the + * non-logged changes in this checkpoint. If logging is enabled, a + * real checkpoint LSN will be assigned for this checkpoint and + * overwrite this. + */ + WT_MAX_LSN(&ckptlsn); + + /* + * Check for clean objects not requiring a checkpoint. 
+ * + * If we're closing a handle, and the object is clean, we can skip the + * checkpoint, whatever checkpoints we have are sufficient. (We might + * not have any checkpoints if the object was never modified, and that's + * OK: the object creation code doesn't mark the tree modified so we can + * skip newly created trees here.) + * + * If the application repeatedly checkpoints an object (imagine hourly + * checkpoints using the same explicit or internal name), there's no + * reason to repeat the checkpoint for clean objects. The test is if + * the only checkpoint we're deleting is the last one in the list and + * it has the same name as the checkpoint we're about to take, skip the + * work. (We can't skip checkpoints that delete more than the last + * checkpoint because deleting those checkpoints might free up space in + * the file.) This means an application toggling between two (or more) + * checkpoint names will repeatedly take empty checkpoints, but that's + * not likely enough to make detection worthwhile. + * + * Checkpoint read-only objects otherwise: the application must be able + * to open the checkpoint in a cursor after taking any checkpoint, which + * means it must exist. + */ + force = false; + F_CLR(btree, WT_BTREE_SKIP_CKPT); + if (!btree->modified && cfg != NULL) { + ret = __wt_config_gets(session, cfg, "force", &cval); + if (ret != 0 && ret != WT_NOTFOUND) + WT_ERR(ret); + if (ret == 0 && cval.val != 0) + force = true; + } + if (!btree->modified && !force) { + if (!is_checkpoint) + goto nockpt; + + deleted = 0; + WT_CKPT_FOREACH(ckptbase, ckpt) + if (F_ISSET(ckpt, WT_CKPT_DELETE)) + ++deleted; + /* + * Complicated test: if the tree is clean and last two + * checkpoints have the same name (correcting for internal + * checkpoint names with their generational suffix numbers), we + * can skip the checkpoint, there's nothing to do. The + * exception is if we're deleting two or more checkpoints: then + * we may save space. 
+ */ + name = (ckpt - 1)->name; + if (ckpt > ckptbase + 1 && deleted < 2 && + (strcmp(name, (ckpt - 2)->name) == 0 || + (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && + WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) { +nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, + btree->checkpoint_gen); + ret = 0; + goto err; } + } /* * If an object has never been used (in other words, if it could become @@ -1100,9 +1121,9 @@ nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); /* Flush the file from the cache, creating the checkpoint. */ if (is_checkpoint) - WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT)); + WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT)); else - WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE)); + WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE)); /* * All blocks being written have been written; set the object's write @@ -1134,9 +1155,8 @@ fake: /* * sync the file here or we could roll forward the metadata in * recovery and open a checkpoint that isn't yet durable. */ - if (F_ISSET(conn, WT_CONN_CKPT_SYNC) && - (WT_IS_METADATA(session, dhandle) || - !F_ISSET(&session->txn, WT_TXN_RUNNING))) + if (WT_IS_METADATA(session, dhandle) || + !F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); WT_ERR(__wt_meta_ckptlist_set( @@ -1161,7 +1181,6 @@ fake: /* WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_STOP, NULL)); -done: err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. 
@@ -1169,29 +1188,42 @@ err: /* if (ret != 0 && !btree->modified && was_modified) btree->modified = 1; - if (hot_backup_locked) - WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); - __wt_meta_ckptlist_free(session, ckptbase); - __wt_free(session, name_alloc); + btree->ckpt = NULL; return (ret); } /* + * __checkpoint_tree_helper -- + * Checkpoint a tree (suitable for use in *_apply functions). + */ +static int +__checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) +{ + return (__checkpoint_tree(session, true, cfg)); +} + +/* * __wt_checkpoint -- * Checkpoint a file. */ int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { + WT_DECL_RET; + /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); - /* Should be holding the schema lock. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + /* We must hold the metadata lock if checkpointing the metadata. */ + WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) || + F_ISSET(session, WT_SESSION_LOCKED_METADATA)); - return (__checkpoint_worker(session, cfg, true, true)); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, true, true, cfg)); + WT_RET(ret); + return (__checkpoint_tree(session, true, cfg)); } /* @@ -1210,8 +1242,9 @@ __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); - /* Should have an underlying block manager reference. */ - WT_ASSERT(session, bm != NULL); + /* Unnecessary if checkpoint_sync has been configured "off". 
*/ + if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) + return (0); return (bm->sync(bm, session, false)); } @@ -1240,7 +1273,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) - return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); + return (__wt_cache_op(session, WT_SYNC_DISCARD)); /* * If closing an unmodified file, check that no update is required @@ -1249,21 +1282,13 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (!btree->modified && !bulk) { __wt_txn_update_oldest(session, true); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? - __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY); + __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY); } /* - * We should already have the schema lock unless we're finishing a bulk - * load -- the only other paths to closing files (sweep and LSM) have - * already checked for read-only trees. - */ - WT_ASSERT(session, - final || bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - - /* * Turn on metadata tracking if: * - The session is not already doing metadata tracking. - * - The file was bulk loaded. + * - The file was not bulk loaded. * - The close is not during connection close. 
*/ need_tracking = !WT_META_TRACKING(session) && !bulk && !final; @@ -1271,10 +1296,14 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) if (need_tracking) WT_RET(__wt_meta_track_on(session)); - WT_TRET(__checkpoint_worker(session, NULL, false, need_tracking)); + WT_SAVE_DHANDLE(session, + ret = __checkpoint_lock_tree(session, false, need_tracking, NULL)); + WT_ASSERT(session, ret == 0); + if (ret == 0) + ret = __checkpoint_tree(session, false, NULL); if (need_tracking) - WT_RET(__wt_meta_track_off(session, true, ret != 0)); + WT_TRET(__wt_meta_track_off(session, true, ret != 0)); return (ret); } diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index e6bd8a8d755..1ea4dba1152 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -88,11 +88,11 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, * Helper to a cursor if this operation is to be applied during recovery. */ #define GET_RECOVERY_CURSOR(session, r, lsnp, fileid, cp) \ - WT_ERR(__recovery_cursor( \ - (session), (r), (lsnp), (fileid), false, (cp))); \ - WT_ERR(__wt_verbose((session), WT_VERB_RECOVERY, \ - "%s op %d to file %d at LSN %u/%u", \ - (cursor == NULL) ? "Skipping" : "Applying", \ + WT_ERR(__recovery_cursor(session, r, lsnp, fileid, false, cp)); \ + WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, \ + "%s op %" PRIu32 " to file %" PRIu32 " at LSN %" PRIu32 \ + "/%" PRIu32, \ + cursor == NULL ? 
"Skipping" : "Applying", \ optype, fileid, lsnp->l.file, lsnp->l.offset)); \ if (cursor == NULL) \ break @@ -334,7 +334,7 @@ __recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config) r->files[fileid].ckpt_lsn = lsn; WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY, - "Recovering %s with id %u @ (%" PRIu32 ", %" PRIu32 ")", + "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")", uri, fileid, lsn.l.file, lsn.l.offset)); return (0); @@ -449,6 +449,18 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ if (!was_backup) { r.metadata_only = true; + /* + * If this is a read-only connection, check if the checkpoint + * LSN in the metadata file is up to date, indicating a clean + * shutdown. + */ + if (F_ISSET(conn, WT_CONN_READONLY)) { + WT_ERR(__wt_log_needs_recovery( + session, &metafile->ckpt_lsn, &needs_rec)); + if (needs_rec) + WT_ERR_MSG(session, WT_RUN_RECOVERY, + "Read-only database needs recovery"); + } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); @@ -484,7 +496,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ r.metadata_only = false; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, - "Main recovery loop: starting at %u/%u", + "Main recovery loop: starting at %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* @@ -492,8 +504,17 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * return an error if the user does not want automatic * recovery. 
*/ - if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) + if (needs_rec && + (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || + F_ISSET(conn, WT_CONN_READONLY))) { + if (F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR_MSG(session, WT_RUN_RECOVERY, + "Read-only database needs recovery"); WT_ERR(WT_RUN_RECOVERY); + } + + if (F_ISSET(conn, WT_CONN_READONLY)) + goto done; /* * Recovery can touch more data than fits in cache, so it relies on @@ -504,7 +525,8 @@ __wt_txn_recover(WT_SESSION_IMPL *session) eviction_started = true; /* - * Always run recovery even if it was a clean shutdown. + * Always run recovery even if it was a clean shutdown only if + * this is not a read-only connection. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index ca761a52d8a..aedd9168fbd 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -22,10 +22,10 @@ static int dump_prefix(WT_SESSION *, bool); static int dump_record(WT_CURSOR *, bool, bool); static int dump_suffix(WT_SESSION *); static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *); -static int dump_table_config_type( +static int dump_table_config_complex( WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, const char *); static int dup_json_string(const char *, char **); -static int print_config(WT_SESSION *, const char *, const char *, const char *); +static int print_config(WT_SESSION *, const char *, char *[]); static int usage(void); int @@ -150,9 +150,9 @@ dump_config(WT_SESSION *session, const char *uri, bool hex) /* Open a metadata cursor. 
*/ if ((ret = session->open_cursor( - session, "metadata:create", NULL, NULL, &cursor)) != 0) { + session, "metadata:", NULL, NULL, &cursor)) != 0) { fprintf(stderr, "%s: %s: session.open_cursor: %s\n", progname, - "metadata:create", session->strerror(session, ret)); + "metadata:", session->strerror(session, ret)); return (1); } /* @@ -352,12 +352,23 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) static int dump_json_table_config(WT_SESSION *session, const char *uri) { + WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_DECL_RET; + size_t len; int tret; - char *value; + const char *name, *value; + char *p; + + p = NULL; + + /* Get the table name. */ + if ((name = strchr(uri, ':')) == NULL) { + fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri); + return (1); + } + ++name; - /* Dump the config. */ /* Open a metadata cursor. */ if ((ret = session->open_cursor( session, "metadata:create", NULL, NULL, &cursor)) != 0) { @@ -368,12 +379,41 @@ dump_json_table_config(WT_SESSION *session, const char *uri) } /* - * Search for the object itself, to make sure it - * exists, and get its config string. This where we - * find out a table object doesn't exist, use a simple - * error message. + * Search for the object itself, just to make sure it exists, we don't + * want to output a header if the user entered the wrong name. This is + * where we find out a table doesn't exist, use a simple error message. + * + * Workaround for WiredTiger "simple" table handling. Simple tables + * have column-group entries, but they aren't listed in the metadata's + * table entry. Figure out if it's a simple table and in that case, + * retrieve the column-group entry and use the value from its "source" + * file. 
*/ - cursor->set_key(cursor, uri); + if (WT_PREFIX_MATCH(uri, "table:")) { + len = strlen("colgroup:") + strlen(name) + 1; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "colgroup:%s", name); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) == 0) { + if ((ret = cursor->get_value(cursor, &value)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((ret = __wt_config_getones( + (WT_SESSION_IMPL *)session, + value, "source", &cval)) != 0) + return (util_err( + session, ret, "%s: source entry", p)); + free(p); + len = cval.len + 10; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); + cursor->set_key(cursor, p); + } else + cursor->set_key(cursor, uri); + } else + cursor->set_key(cursor, uri); + if ((ret = cursor->search(cursor)) == 0) { if ((ret = cursor->get_value(cursor, &value)) != 0) ret = util_cerr(cursor, "get_value", ret); @@ -381,8 +421,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri) session, cursor, uri, value) != 0) ret = 1; } else if (ret == WT_NOTFOUND) - ret = util_err( - session, 0, "%s: No such object exists", uri); + ret = util_err(session, 0, "%s: No such object exists", uri); else ret = util_err(session, ret, "%s", uri); @@ -392,6 +431,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri) ret = tret; } + free(p); return (ret); } @@ -414,10 +454,17 @@ dump_json_table_end(WT_SESSION *session) static int dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri) { + WT_CONFIG_ITEM cval; WT_CURSOR *srch; WT_DECL_RET; + size_t len; int tret; - const char *key, *name, *value; + bool complex_table; + const char *name, *v; + char *p, **cfg, *_cfg[4] = {NULL, NULL, NULL, NULL}; + + p = NULL; + cfg = &_cfg[3]; /* Get the table name. 
*/ if ((name = strchr(uri, ':')) == NULL) { @@ -427,59 +474,111 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri) ++name; /* - * Dump out the config information: first, dump the uri entry itself - * (requires a lookup). + * Dump out the config information: first, dump the uri entry itself, + * it overrides all subsequent configurations. */ cursor->set_key(cursor, uri); if ((ret = cursor->search(cursor)) != 0) return (util_cerr(cursor, "search", ret)); - if ((ret = cursor->get_key(cursor, &key)) != 0) - return (util_cerr(cursor, "get_key", ret)); - if ((ret = cursor->get_value(cursor, &value)) != 0) + if ((ret = cursor->get_value(cursor, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); - if (print_config(session, key, value, NULL) != 0) - return (1); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* - * The underlying table configuration function needs a second cursor: - * open one before calling it, it makes error handling hugely simpler. + * Workaround for WiredTiger "simple" table handling. Simple tables + * have column-group entries, but they aren't listed in the metadata's + * table entry, and the name is different from other column-groups. + * Figure out if it's a simple table and in that case, retrieve the + * column-group's configuration value and the column-group's "source" + * entry, where the column-group entry overrides the source's. 
*/ - if ((ret = - session->open_cursor(session, NULL, cursor, NULL, &srch)) != 0) - return (util_cerr(cursor, "open_cursor", ret)); + complex_table = false; + if (WT_PREFIX_MATCH(uri, "table:")) { + len = strlen("colgroup:") + strlen(name) + 1; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "colgroup:%s", name); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) == 0) { + if ((ret = cursor->get_value(cursor, &v)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); + if ((ret =__wt_config_getones( + (WT_SESSION_IMPL *)session, + *cfg, "source", &cval)) != 0) + return (util_err( + session, ret, "%s: source entry", p)); + free(p); + len = cval.len + 10; + if ((p = malloc(len)) == NULL) + return (util_err(session, errno, NULL)); + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); + cursor->set_key(cursor, p); + if ((ret = cursor->search(cursor)) != 0) + return (util_cerr(cursor, "search", ret)); + if ((ret = cursor->get_value(cursor, &v)) != 0) + return (util_cerr(cursor, "get_value", ret)); + if ((*--cfg = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); + } else + complex_table = true; + } - if ((ret = dump_table_config_type( - session, cursor, srch, name, "colgroup:")) == 0) - ret = dump_table_config_type( - session, cursor, srch, name, "index:"); + if (print_config(session, uri, cfg) != 0) + return (1); - if ((tret = srch->close(srch)) != 0) { - tret = util_cerr(cursor, "close", tret); - if (ret == 0) - ret = tret; + if (complex_table) { + /* + * The underlying table configuration function needs a second + * cursor: open one before calling it, it makes error handling + * hugely simpler. 
+ */ + if ((ret = session->open_cursor( + session, "metadata:", NULL, NULL, &srch)) != 0) + return (util_cerr(cursor, "open_cursor", ret)); + + if ((ret = dump_table_config_complex( + session, cursor, srch, name, "colgroup:")) == 0) + ret = dump_table_config_complex( + session, cursor, srch, name, "index:"); + + if ((tret = srch->close(srch)) != 0) { + tret = util_cerr(cursor, "close", tret); + if (ret == 0) + ret = tret; + } } + free(p); + free(_cfg[0]); + free(_cfg[1]); + free(_cfg[2]); return (ret); } /* - * dump_table_config_type -- + * dump_table_config_complex -- * Dump the column groups or indices for a table. */ static int -dump_table_config_type(WT_SESSION *session, +dump_table_config_complex(WT_SESSION *session, WT_CURSOR *cursor, WT_CURSOR *srch, const char *name, const char *entry) { WT_CONFIG_ITEM cval; WT_DECL_RET; - const char *key, *skip, *value, *value_source; + const char *key; + size_t len; int exact; - char *p; + const char *v; + char *p, *cfg[3] = {NULL, NULL, NULL}; /* * Search the file looking for column group and index key/value pairs: * for each one, look up the related source information and append it - * to the base record. + * to the base record, where the column group and index configuration + * overrides the source configuration. */ cursor->set_key(cursor, entry); if ((ret = cursor->search_near(cursor, &exact)) != 0) { @@ -497,27 +596,32 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) if (!WT_PREFIX_MATCH(key, entry)) return (0); - /* Check for a table name match. */ - skip = key + strlen(entry); - if (strncmp( - skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':') + /* + * Check for a table name match. This test will match "simple" + * table column-groups as well as the more complex ones, but + * the previous version of the test was wrong and we're only + * in this function in the case of complex tables. + */ + if (!WT_PREFIX_MATCH(key + strlen(entry), name)) continue; /* Get the value. 
*/ - if ((ret = cursor->get_value(cursor, &value)) != 0) + if ((ret = cursor->get_value(cursor, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); + if ((cfg[1] = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* Crack it and get the underlying source. */ if ((ret = __wt_config_getones( - (WT_SESSION_IMPL *)session, value, "source", &cval)) != 0) + (WT_SESSION_IMPL *)session, cfg[1], "source", &cval)) != 0) return ( util_err(session, ret, "%s: source entry", key)); /* Nul-terminate the source entry. */ - if ((p = malloc(cval.len + 10)) == NULL) + len = cval.len + 10; + if ((p = malloc(len)) == NULL) return (util_err(session, errno, NULL)); - (void)strncpy(p, cval.str, cval.len); - p[cval.len] = '\0'; + (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str); srch->set_key(srch, p); if ((ret = srch->search(srch)) != 0) ret = util_err(session, ret, "%s: %s", key, p); @@ -526,16 +630,22 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0) return (1); /* Get the source's value. */ - if ((ret = srch->get_value(srch, &value_source)) != 0) + if ((ret = srch->get_value(srch, &v)) != 0) return (util_cerr(cursor, "get_value", ret)); + if ((cfg[0] = strdup(v)) == NULL) + return (util_err(session, errno, NULL)); /* * The dumped configuration string is the original key plus the - * source's configuration. + * source's configuration, where the values of the original key + * override any source configurations of the same name. */ - if (print_config(session, key, value, value_source) != 0) + if (print_config(session, key, cfg) != 0) return (util_err(session, EIO, NULL)); } + free(cfg[0]); + free(cfg[1]); + if (ret == 0 || ret == WT_NOTFOUND) return (0); return (util_cerr(cursor, "next", ret)); @@ -649,27 +759,21 @@ dup_json_string(const char *str, char **result) * Output a key/value URI pair by combining v1 and v2. 
*/ static int -print_config(WT_SESSION *session, - const char *key, const char *v1, const char *v2) +print_config(WT_SESSION *session, const char *key, char *cfg[]) { WT_DECL_RET; char *value_ret; - const char *cfg[] = { v1, v2, NULL }; /* - * The underlying call will stop if the first string is NULL -- check - * here and swap in that case. + * We have all of the object configuration, but don't have the default + * session.create configuration. Have the underlying library add in the + * defaults and collapse it all into one load configuration string. */ - if (cfg[0] == NULL) { - cfg[0] = cfg[1]; - cfg[1] = NULL; - } - - if ((ret = __wt_config_collapse( + if ((ret = __wt_schema_create_final( (WT_SESSION_IMPL *)session, cfg, &value_ret)) != 0) return (util_err(session, ret, NULL)); ret = printf("%s\n%s\n", key, value_ret); - free((char *)value_ret); + free(value_ret); if (ret < 0) return (util_err(session, EIO, NULL)); return (0); |