diff options
author | Alexander Gorrod <alexander.gorrod@mongodb.com> | 2015-07-06 00:17:49 +0000 |
---|---|---|
committer | Alexander Gorrod <alexander.gorrod@mongodb.com> | 2015-07-06 00:17:49 +0000 |
commit | f31038b98941bdc72c13449183854a690fd20653 (patch) | |
tree | b64ad23009156438cc8a58e994a4de1f63a65ff9 | |
parent | db0ba62bd4a375f86e36c992033894569233000f (diff) | |
download | mongo-f31038b98941bdc72c13449183854a690fd20653.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.0.4-20-ga3b359d.tar.gz from wiredtiger branch mongodb-3.0
39 files changed, 798 insertions, 348 deletions
diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win index e297ca16b06..8655c0eda8e 100644 --- a/src/third_party/wiredtiger/build_win/filelist.win +++ b/src/third_party/wiredtiger/build_win/filelist.win @@ -85,6 +85,7 @@ src/log/log.c src/log/log_auto.c src/log/log_slot.c src/lsm/lsm_cursor.c +src/lsm/lsm_cursor_bulk.c src/lsm/lsm_manager.c src/lsm/lsm_merge.c src/lsm/lsm_meta.c diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 351067d7ba5..5ad422befb4 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -715,22 +715,22 @@ methods = { type='boolean', undoc=True), Config('statistics', '', r''' Specify the statistics to be gathered. Choosing "all" gathers - statistics regardless of cost and may include traversing - on-disk files; "fast" gathers a subset of relatively - inexpensive statistics. The selection must agree with the - database \c statistics configuration specified to - ::wiredtiger_open or WT_CONNECTION::reconfigure. For example, - "all" or "fast" can be configured when the database is - configured with "all", but the cursor open will fail if "all" - is specified when the database is configured with "fast", - and the cursor open will fail in all cases when the database - is configured with "none". If \c statistics is not configured, - the default configuration is the database configuration. - The "clear" configuration resets statistics after gathering - them, where appropriate (for example, a cache size statistic - is not cleared, while the count of cursor insert operations - will be cleared). See @ref statistics for more information''', - type='list', choices=['all', 'fast', 'clear']), + statistics regardless of cost and may include traversing on-disk files; + "fast" gathers a subset of relatively inexpensive statistics. The + selection must agree with the database \c statistics configuration + specified to ::wiredtiger_open or WT_CONNECTION::reconfigure. For + example, "all" or "fast" can be configured when the database is + configured with "all", but the cursor open will fail if "all" is + specified when the database is configured with "fast", and the cursor + open will fail in all cases when the database is configured with + "none". If "size" is configured, only the underlying size of the + object on disk is filled in and the object is not opened. If \c + statistics is not configured, the default configuration is the database + configuration. The "clear" configuration resets statistics after + gathering them, where appropriate (for example, a cache size statistic + is not cleared, while the count of cursor insert operations will be + cleared). See @ref statistics for more information''', + type='list', choices=['all', 'fast', 'clear', 'size']), Config('target', '', r''' if non-empty, backup the list of objects; valid only for a backup data source''', @@ -767,6 +767,11 @@ methods = { Config('dump_shape', 'false', r''' Display the shape of the tree after verification, using the application's message handler, intended for debugging''', + type='boolean'), + Config('strict', 'false', r''' + Treat any verification problem as an error; by default, verify will + warn, but not fail, in the case of errors that won't affect future + behavior (for example, a leaked block)''', type='boolean') ]), diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index ee70ccf765e..af72bab6718 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -85,6 +85,7 @@ src/log/log.c src/log/log_auto.c src/log/log_slot.c src/lsm/lsm_cursor.c +src/lsm/lsm_cursor_bulk.c src/lsm/lsm_manager.c src/lsm/lsm_merge.c src/lsm/lsm_meta.c diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c index 13e6ec73b32..558008ee7b0 100644 --- a/src/third_party/wiredtiger/src/block/block_mgr.c +++ b/src/third_party/wiredtiger/src/block/block_mgr.c @@ -302,9 +302,10 @@ __bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session) * Start a block manager verify. */ static int -__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase) +__bm_verify_start(WT_BM *bm, + WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) { - return (__wt_block_verify_start(session, bm->block, ckptbase)); + return (__wt_block_verify_start(session, bm->block, ckptbase, cfg)); } /* diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 5a882f0fb7c..8e45ec85a97 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -388,7 +388,7 @@ err: __wt_scr_free(session, &buf); /* * __wt_block_stat -- - * Block statistics + * Set the statistics for a live block handle. */ void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) @@ -409,3 +409,19 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) WT_STAT_SET(stats, block_size, block->fh->size); __wt_spin_unlock(session, &block->live_lock); } + +/* + * __wt_block_manager_size -- + * Set the size statistic for a file. + */ +int +__wt_block_manager_size( + WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats) +{ + wt_off_t filesize; + + WT_RET(__wt_filesize_name(session, filename, &filesize)); + WT_STAT_SET(stats, block_size, filesize); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c index 1e341aff77a..29a9e4950b4 100644 --- a/src/third_party/wiredtiger/src/block/block_vrfy.c +++ b/src/third_party/wiredtiger/src/block/block_vrfy.c @@ -28,10 +28,11 @@ static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *); * Start file verification. */ int -__wt_block_verify_start( - WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) +__wt_block_verify_start(WT_SESSION_IMPL *session, + WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]) { WT_CKPT *ckpt; + WT_CONFIG_ITEM cval; wt_off_t size; /* @@ -98,6 +99,10 @@ __wt_block_verify_start( */ WT_RET(__verify_last_avail(session, block, ckpt)); + /* Configuration: strict behavior on any error. */ + WT_RET(__wt_config_gets(session, cfg, "strict", &cval)); + block->verify_strict = cval.val ? 1 : 0; + block->verify = 1; return (0); } @@ -164,14 +169,18 @@ __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block) /* Confirm we verified every file block. */ ret = __verify_filefrag_chk(session, block); + block->verify = 0; + block->verify_strict = 0; + block->verify_size = 0; + /* Discard the accumulated allocation list. */ __wt_block_extlist_free(session, &block->verify_alloc); /* Discard the fragment tracking lists. */ + block->frags = 0; __wt_free(session, block->fragfile); __wt_free(session, block->fragckpt); - block->verify = 0; return (ret); } @@ -434,7 +443,7 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) return (0); __wt_errx(session, "file ranges never verified: %" PRIu64, count); - return (WT_ERROR); + return (block->verify_strict ? WT_ERROR : 0); } /* @@ -527,5 +536,5 @@ __verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block) __wt_errx(session, "checkpoint ranges never verified: %" PRIu64, count); - return (WT_ERROR); + return (block->verify_strict ? WT_ERROR : 0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 0b93cc981d7..120220223f8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -49,6 +49,9 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); + /* Bump the oldest ID, we're about to do some visibility checks. */ + __wt_txn_update_oldest(session, 0); + /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, page, 1)); } @@ -168,7 +171,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = - __wt_cache_read_gen_set(session); + __wt_cache_read_gen_bump(session); return (0); WT_ILLEGAL_VALUE(session); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index f5c3d5fa331..eb2382cd610 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1028,20 +1028,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * are holding it locked. */ if (ret == 0 && !exclusive && - !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) && - __split_should_deepen(session, parent_ref, &children)) { - /* - * XXX - * Temporary hack to avoid a bug where the root page is split - * even when it's no longer doing any good. - */ - uint64_t __a, __b; - __a = parent->memory_footprint; + __split_should_deepen(session, parent_ref, &children)) ret = __split_deepen(session, parent, children); - __b = parent->memory_footprint; - if (__b * 2 >= __a) - F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN); - } err: if (!complete) for (i = 0; i < parent_entries; ++i) { diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 71b0d0abdb3..ca3b8f327b3 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -71,7 +71,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) __wt_txn_visible_all( session, page->modify->update_txn)) { if (txn->isolation == TXN_ISO_READ_COMMITTED) - __wt_txn_refresh(session, 1); + __wt_txn_get_snapshot(session); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); @@ -190,6 +190,18 @@ err: /* On error, clear any left-over tree walk. */ if (btree->checkpointing) { /* + * Update the checkpoint generation for this handle so visible + * updates newer than the checkpoint can be evicted. + * + * This has to be published before eviction is enabled again, + * so that eviction knows that the checkpoint has completed. + */ + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, btree->checkpoint_gen); + + /* * Clear the checkpoint flag and push the change; not required, * but publishing the change means stalled eviction gets moving * as soon as possible. diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 45c2029f6ed..93d1ddad8c6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -23,7 +23,7 @@ typedef struct { #define WT_VRFY_DUMP(vs) \ ((vs)->dump_address || \ (vs)->dump_blocks || (vs)->dump_pages || (vs)->dump_shape) - int dump_address; /* Debugging hooks */ + int dump_address; /* Configure: dump special */ int dump_blocks; int dump_pages; int dump_shape; @@ -176,7 +176,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ - WT_ERR(bm->verify_start(bm, session, ckptbase)); + WT_ERR(bm->verify_start(bm, session, ckptbase, cfg)); bm_start = 1; /* Loop through the file's checkpoints, verifying each one. */ diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index e2990f26719..d068c196771 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -312,7 +312,7 @@ static const WT_CONFIG_CHECK confchk_session_open_cursor[] = { { "readonly", "boolean", NULL, NULL, NULL }, { "skip_sort_check", "boolean", NULL, NULL, NULL }, { "statistics", "list", - NULL, "choices=[\"all\",\"fast\",\"clear\"]", + NULL, "choices=[\"all\",\"fast\",\"clear\",\"size\"]", NULL }, { "target", "list", NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL } @@ -337,6 +337,7 @@ static const WT_CONFIG_CHECK confchk_session_verify[] = { { "dump_offsets", "list", NULL, NULL, NULL }, { "dump_pages", "boolean", NULL, NULL, NULL }, { "dump_shape", "boolean", NULL, NULL, NULL }, + { "strict", "boolean", NULL, NULL, NULL }, { NULL, NULL, NULL, NULL, NULL } }; @@ -780,7 +781,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { }, { "session.verify", "dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0," - "dump_shape=0", + "dump_shape=0,strict=0", confchk_session_verify }, { "table.meta", diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 9d49e36a5ca..85d9bb08d26 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -270,11 +270,11 @@ err: } /* - * __log_close_server -- + * __log_file_server -- * The log close server thread. */ static WT_THREAD_RET -__log_close_server(void *arg) +__log_file_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -317,6 +317,8 @@ __log_close_server(void *arg) __wt_spin_lock(session, &log->log_sync_lock); locked = 1; WT_ERR(__wt_close(session, &close_fh)); + WT_ASSERT(session, + LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0); log->sync_lsn = close_end_lsn; WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); locked = 0; @@ -324,7 +326,7 @@ __log_close_server(void *arg) } else /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, - conn->log_close_cond, WT_MILLION)); + conn->log_file_cond, WT_MILLION)); } if (0) { @@ -433,7 +435,7 @@ __log_wrlsn_server(void *arg) */ if (F_ISSET(slot, SLOT_CLOSEFH)) WT_ERR(__wt_cond_signal(session, - conn->log_close_cond)); + conn->log_file_cond)); WT_ERR(__wt_log_slot_free(session, slot)); } } @@ -583,16 +585,16 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) * If logging is enabled, this thread runs. */ WT_RET(__wt_open_internal_session( - conn, "log-close-server", 0, 0, &conn->log_close_session)); - WT_RET(__wt_cond_alloc(conn->log_close_session, - "log close server", 0, &conn->log_close_cond)); + conn, "log-close-server", 0, 0, &conn->log_file_session)); + WT_RET(__wt_cond_alloc(conn->log_file_session, + "log close server", 0, &conn->log_file_cond)); /* * Start the log file close thread. */ - WT_RET(__wt_thread_create(conn->log_close_session, - &conn->log_close_tid, __log_close_server, conn->log_close_session)); - conn->log_close_tid_set = 1; + WT_RET(__wt_thread_create(conn->log_file_session, + &conn->log_file_tid, __log_file_server, conn->log_file_session)); + conn->log_file_tid_set = 1; /* * Start the log write LSN thread. It is not configurable. @@ -667,16 +669,16 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_tid_set = 0; } WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); - if (conn->log_close_tid_set) { - WT_TRET(__wt_cond_signal(session, conn->log_close_cond)); - WT_TRET(__wt_thread_join(session, conn->log_close_tid)); - conn->log_close_tid_set = 0; + if (conn->log_file_tid_set) { + WT_TRET(__wt_cond_signal(session, conn->log_file_cond)); + WT_TRET(__wt_thread_join(session, conn->log_file_tid)); + conn->log_file_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond)); - if (conn->log_close_session != NULL) { - wt_session = &conn->log_close_session->iface; + WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); + if (conn->log_file_session != NULL) { + wt_session = &conn->log_file_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); - conn->log_close_session = NULL; + conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond)); diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c index e0e59dea8ba..ca8335fbdb9 100644 --- a/src/third_party/wiredtiger/src/conn/conn_open.c +++ b/src/third_party/wiredtiger/src/conn/conn_open.c @@ -92,7 +92,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * transaction ID will catch up with the current ID. */ for (;;) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (txn_global->oldest_id == txn_global->current) break; __wt_yield(); diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 85442592c39..82568401319 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -373,6 +373,22 @@ __curstat_file_init(WT_SESSION_IMPL *session, { WT_DATA_HANDLE *dhandle; WT_DECL_RET; + const char *filename; + + /* + * If we are only getting the size of the file, we don't need to open + * the tree. + */ + if (F_ISSET(cst, WT_CONN_STAT_SIZE)) { + filename = uri; + if (!WT_PREFIX_SKIP(filename, "file:")) + return (EINVAL); + __wt_stat_init_dsrc_stats(&cst->u.dsrc_stats); + WT_RET(__wt_block_manager_size( + session, filename, &cst->u.dsrc_stats)); + __wt_curstat_dsrc_final(cst); + return (0); + } WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0)); dhandle = session->dhandle; @@ -508,8 +524,22 @@ __wt_curstat_open(WT_SESSION_IMPL *session, } WT_ERR_NOTFOUND_OK(ret); if ((ret = __wt_config_subgets( - session, &cval, "clear", &sval)) == 0 && sval.val != 0) + session, &cval, "size", &sval)) == 0 && sval.val != 0) { + if (F_ISSET(cst, WT_CONN_STAT_FAST | WT_CONN_STAT_ALL)) + WT_ERR_MSG(session, EINVAL, + "only one statistics configuration value " + "may be specified"); + F_SET(cst, WT_CONN_STAT_SIZE); + } + WT_ERR_NOTFOUND_OK(ret); + if ((ret = __wt_config_subgets( + session, &cval, "clear", &sval)) == 0 && sval.val != 0) { + if (F_ISSET(cst, WT_CONN_STAT_SIZE)) + WT_ERR_MSG(session, EINVAL, + "clear is incompatible with size " + "statistics"); F_SET(cst, WT_CONN_STAT_CLEAR); + } WT_ERR_NOTFOUND_OK(ret); /* If no configuration, use the connection's configuration. */ diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c index 864c116a380..795833d3b25 100644 --- a/src/third_party/wiredtiger/src/evict/evict_file.c +++ b/src/third_party/wiredtiger/src/evict/evict_file.c @@ -27,7 +27,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); /* Make sure the oldest transaction ID is up-to-date. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); /* Walk the tree, discarding pages. */ next_ref = NULL; diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index a3bab5457f6..63a905539ce 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -493,6 +493,14 @@ __evict_pass(WT_SESSION_IMPL *session) session, cache->evict_waiter_cond)); } + /* + * Increment the shared read generation. We do this + * occasionally even if eviction is not currently required, so + * that pages have some relative read generation when the + * eviction server does need to do some work. + */ + __wt_cache_read_gen_incr(session); + WT_RET(__evict_has_work(session, &flags)); if (flags == 0) break; @@ -681,7 +689,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) * before evicting, using a special "eviction" isolation level, where * only globally visible updates can be evicted. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); txn = &session->txn; saved_iso = txn->isolation; txn->isolation = TXN_ISO_EVICTION; @@ -838,6 +846,9 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) WT_ASSERT(session, cache->evict[0].ref != NULL); + /* Track the oldest read generation we have in the queue. */ + cache->read_gen_oldest = cache->evict[0].ref->page->read_gen; + if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) /* * Take all candidates if we only gathered pages with an oldest @@ -933,16 +944,13 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags) incr = dhandle_locked = 0; retries = 0; - /* Increment the shared read generation. */ - __wt_cache_read_gen_incr(session); - /* * Update the oldest ID: we use it to decide whether pages are * candidates for eviction. Without this, if all threads are blocked * after a long-running transaction (such as a checkpoint) completes, * we may never start evicting again. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (cache->evict_current == NULL) WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty); @@ -1222,15 +1230,11 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) continue; /* - * If this page has never been considered for eviction, - * set its read generation to a little bit in the - * future and move on, give readers a chance to start - * updating the read generation. + * If this page has never been considered for eviction, set its + * read generation to somewhere in the middle of the LRU list. */ - if (page->read_gen == WT_READGEN_NOTSET) { - page->read_gen = __wt_cache_read_gen_set(session); - continue; - } + if (page->read_gen == WT_READGEN_NOTSET) + page->read_gen = __wt_cache_read_gen_new(session); fast: /* If the page can't be evicted, give up. */ if (!__wt_page_can_evict(session, page, 1)) @@ -1424,7 +1428,7 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server) */ page = ref->page; if (page->read_gen != WT_READGEN_OLDEST) - page->read_gen = __wt_cache_read_gen_set(session); + page->read_gen = __wt_cache_read_gen_bump(session); /* * If we are evicting in a dead tree, don't write dirty pages. @@ -1475,7 +1479,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * to make sure there is free space in the cache. */ txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); busy = txn_state->id != WT_TXN_NONE || session->nhazard > 0 || (txn_state->snap_min != WT_TXN_NONE && @@ -1524,7 +1528,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full) * are not busy. */ if (busy) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 0); if (txn_state->id == txn_global->oldest_id || txn_state->snap_min == txn_global->oldest_id) return (0); diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 92ad8d296df..fe08916b24c 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -59,6 +59,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive) conn = S2C(session); + /* Checkpoints should never do eviction. */ + WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session)); + page = ref->page; forced_eviction = (page->read_gen == WT_READGEN_OLDEST); inmem_split = 0; diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h index 4ef1b9da4ec..fb8987efdb4 100644 --- a/src/third_party/wiredtiger/src/include/block.h +++ b/src/third_party/wiredtiger/src/include/block.h @@ -185,7 +185,8 @@ struct __wt_bm { int (*sync)(WT_BM *, WT_SESSION_IMPL *, int); int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t); int (*verify_end)(WT_BM *, WT_SESSION_IMPL *); - int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *); + int (*verify_start) + (WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]); int (*write) (WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int); int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *); @@ -246,6 +247,7 @@ struct __wt_block { /* Verification support */ int verify; /* If performing verification */ + int verify_strict; /* Fail hard on any error */ wt_off_t verify_size; /* Checkpoint's file size */ WT_EXTLIST verify_alloc; /* Verification allocation list */ uint64_t frags; /* Maximum frags in the file */ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 303162fcc93..23b17ef2cd3 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -194,6 +194,11 @@ struct __wt_page_modify { /* The largest update transaction ID (approximate). */ uint64_t update_txn; +#ifdef HAVE_DIAGNOSTIC + /* Check that transaction time moves forward. */ + uint64_t last_oldest_id; +#endif + /* Dirty bytes added to the cache. */ size_t bytes_dirty; @@ -534,10 +539,9 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */ -#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */ -#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */ -#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */ +#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_SPLITTING 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h index 11f631416af..58b7b4dbddb 100644 --- a/src/third_party/wiredtiger/src/include/cache.h +++ b/src/third_party/wiredtiger/src/include/cache.h @@ -71,6 +71,8 @@ struct __wt_cache { * Read information. */ uint64_t read_gen; /* Page read generation (LRU) */ + uint64_t read_gen_oldest; /* The oldest read generation that + eviction knows about */ /* * Eviction thread information. diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i index f952f1bf698..d84069c43fb 100644 --- a/src/third_party/wiredtiger/src/include/cache.i +++ b/src/third_party/wiredtiger/src/include/cache.i @@ -27,11 +27,11 @@ __wt_cache_read_gen_incr(WT_SESSION_IMPL *session) } /* - * __wt_cache_read_gen_set -- - * Get the read generation to store in a page. + * __wt_cache_read_gen_bump -- + * Get the read generation to keep a page in memory. */ static inline uint64_t -__wt_cache_read_gen_set(WT_SESSION_IMPL *session) +__wt_cache_read_gen_bump(WT_SESSION_IMPL *session) { /* * We return read-generations from the future (where "the future" is @@ -46,6 +46,19 @@ __wt_cache_read_gen_set(WT_SESSION_IMPL *session) } /* + * __wt_cache_read_gen_new -- + * Get the read generation for a new page in memory. + */ +static inline uint64_t +__wt_cache_read_gen_new(WT_SESSION_IMPL *session) +{ + WT_CACHE *cache; + + cache = S2C(session)->cache; + return (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2; +} + +/* * __wt_cache_pages_inuse -- * Return the number of pages in use. */ diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index a95b051fbc0..f24459a4147 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -252,6 +252,7 @@ struct __wt_connection_impl { #define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */ #define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */ #define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */ +#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ uint32_t stat_flags; WT_CONNECTION_STATS stats; /* Connection statistics */ @@ -317,10 +318,10 @@ struct __wt_connection_impl { WT_SESSION_IMPL *log_session; /* Log server session */ wt_thread_t log_tid; /* Log server thread */ int log_tid_set; /* Log server thread set */ - WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */ - WT_SESSION_IMPL *log_close_session;/* Log close thread session */ - wt_thread_t log_close_tid; /* Log close thread thread */ - int log_close_tid_set;/* Log close thread set */ + WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */ + WT_SESSION_IMPL *log_file_session;/* Log file thread session */ + wt_thread_t log_file_tid; /* Log file thread thread */ + int log_file_tid_set;/* Log file thread set */ WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */ WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */ wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 59e795893b5..63b6bb2cbc5 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -50,6 +50,7 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize); extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats); +extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats); extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size); extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size); extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset); @@ -65,7 +66,7 @@ extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp); extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid); -extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase); +extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]); extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci); extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block); @@ -363,8 +364,12 @@ extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize); +extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); +extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); +extern int __wt_clsm_close(WT_CURSOR *cursor); extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]); extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session); @@ -435,6 +440,7 @@ extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session); extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key); extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key); extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri); +extern int __wt_meta_track_drop( WT_SESSION_IMPL *session, const char *filename); extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created); extern int __wt_turtle_init(WT_SESSION_IMPL *session); extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep); @@ -659,9 +665,9 @@ extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); extern void __wt_stat_refresh_connection_stats(void *stats_arg); extern int WT_CDECL __wt_txnid_cmp(const void *v1, const void *v2); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); -extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session); -extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot); -extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]); +extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); +extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force); +extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]); extern void __wt_txn_release(WT_SESSION_IMPL *session); extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h index aa1d797e3b5..dc6a0d7e027 100644 --- a/src/third_party/wiredtiger/src/include/lsm.h +++ b/src/third_party/wiredtiger/src/include/lsm.h @@ -57,15 +57,16 @@ struct __wt_cursor_lsm { u_int update_count; /* Updates performed. */ -#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */ -#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */ -#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */ -#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */ -#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */ -#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the +#define WT_CLSM_ACTIVE 0x001 /* Incremented the session count */ +#define WT_CLSM_BULK 0x002 /* Open for snapshot isolation */ +#define WT_CLSM_ITERATE_NEXT 0x004 /* Forward iteration */ +#define WT_CLSM_ITERATE_PREV 0x008 /* Backward iteration */ +#define WT_CLSM_MERGE 0x010 /* Merge cursor, don't update */ +#define WT_CLSM_MINOR_MERGE 0x020 /* Minor merge, include tombstones */ +#define WT_CLSM_MULTIPLE 0x040 /* Multiple cursors have values for the current key */ -#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */ -#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */ +#define WT_CLSM_OPEN_READ 0x080 /* Open for reads */ +#define WT_CLSM_OPEN_SNAPSHOT 0x100 /* Open for snapshot isolation */ uint32_t flags; }; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index 927ab09d5f9..d2b369a41c4 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -25,6 +25,9 @@ #define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id]) +#define WT_SESSION_IS_CHECKPOINT(s) \ + ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id) + struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; volatile uint64_t snap_min; @@ -42,22 +45,19 @@ struct __wt_txn_global { */ volatile uint64_t oldest_id; - /* The oldest session found in the last scan. */ - uint32_t oldest_session; - /* Count of scanning threads, or -1 for exclusive access. */ volatile int32_t scan_count; /* - * Track information about the running checkpoint. The transaction IDs - * used when checkpointing are special. Checkpoints can run for a long - * time so we keep them out of regular visibility checks. Eviction and - * checkpoint operations know when they need to be aware of - * checkpoint IDs. + * Track information about the running checkpoint. The transaction + * snapshot used when checkpointing are special. Checkpoints can run + * for a long time so we keep them out of regular visibility checks. + * Eviction and checkpoint operations know when they need to be aware + * of checkpoint transactions. */ + volatile uint32_t checkpoint_id; /* Checkpoint's session ID */ volatile uint64_t checkpoint_gen; - volatile uint64_t checkpoint_id; - volatile uint64_t checkpoint_snap_min; + volatile uint64_t checkpoint_pinned; WT_TXN_STATE *states; /* Per-session transaction states */ }; diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 4ae80231c65..a9b19ca1ff5 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -98,33 +98,37 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_TXN_GLOBAL *txn_global; - uint64_t checkpoint_snap_min, oldest_id; + uint64_t checkpoint_pinned, oldest_id; + uint32_t checkpoint_gen; txn_global = &S2C(session)->txn_global; btree = S2BT_SAFE(session); /* - * Take a local copy of ID in case they are updated while we are + * Take a local copy of these IDs in case they are updated while we are * checking visibility. */ - checkpoint_snap_min = txn_global->checkpoint_snap_min; - oldest_id = txn_global->oldest_id; + WT_ORDERED_READ(oldest_id, txn_global->oldest_id); + WT_ORDERED_READ(checkpoint_gen, txn_global->checkpoint_gen); + WT_ORDERED_READ(checkpoint_pinned, txn_global->checkpoint_pinned); /* - * If there is no active checkpoint or this handle is up to date with - * the active checkpoint it's safe to ignore the checkpoint ID in the - * visibility check. + * Checkpoint transactions often fall behind ordinary application + * threads. Take special effort to not keep changes pinned in cache + * if they are only required for the checkpoint and it has already + * seen them. + * + * If there is no active checkpoint, this session is doing the + * checkpoint, or this handle is up to date with the active checkpoint + * then it's safe to ignore the checkpoint ID in the visibility check. */ - if (checkpoint_snap_min != WT_TXN_NONE && (btree == NULL || - btree->checkpoint_gen != txn_global->checkpoint_gen) && - TXNID_LT(checkpoint_snap_min, oldest_id)) - /* - * Use the checkpoint ID for the visibility check if it is the - * oldest ID in the system. - */ - oldest_id = checkpoint_snap_min; + if (checkpoint_pinned == WT_TXN_NONE || + TXNID_LT(oldest_id, checkpoint_pinned) || + WT_SESSION_IS_CHECKPOINT(session) || + (btree != NULL && btree->checkpoint_gen == checkpoint_gen)) + return (oldest_id); - return (oldest_id); + return (checkpoint_pinned); } /* @@ -154,20 +158,20 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) txn = &session->txn; - /* - * Eviction only sees globally visible updates, or if there is a - * checkpoint transaction running, use its transaction. - */ - if (txn->isolation == TXN_ISO_EVICTION) - return (__wt_txn_visible_all(session, id)); + /* Changes with no associated transaction are always visible. */ + if (id == WT_TXN_NONE) + return (1); /* Nobody sees the results of aborted transactions. */ if (id == WT_TXN_ABORTED) return (0); - /* Changes with no associated transaction are always visible. */ - if (id == WT_TXN_NONE) - return (1); + /* + * Eviction only sees globally visible updates, or if there is a + * checkpoint transaction running, use its transaction. + */ + if (txn->isolation == TXN_ISO_EVICTION) + return (__wt_txn_visible_all(session, id)); /* * Read-uncommitted transactions see all other changes. @@ -206,6 +210,37 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) } /* + * __wt_txn_begin -- + * Begin a transaction. + */ +static int +__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) +{ + WT_TXN *txn; + + txn = &session->txn; + txn->isolation = session->isolation; + txn->txn_logsync = S2C(session)->txn_logsync; + + if (cfg != NULL) + WT_RET(__wt_txn_config(session, cfg)); + + F_SET(txn, TXN_RUNNING); + if (txn->isolation == TXN_ISO_SNAPSHOT) { + if (session->ncursors > 0) + WT_RET(__wt_session_copy_values(session)); + + /* + * We're about to allocate a snapshot: if we need to block for + * eviction, it's better to do it beforehand. + */ + WT_RET(__wt_cache_full_check(session)); + __wt_txn_get_snapshot(session); + } + return (0); +} + +/* * __wt_txn_read -- * Get the first visible update in a list (or NULL if none are visible). */ @@ -301,7 +336,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session) if (!F_ISSET(txn, TXN_HAS_ID)) { conn = S2C(session); txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, txn_state->id == WT_TXN_NONE); @@ -393,7 +428,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) txn = &session->txn; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); /* * If there is no transaction running (so we don't have an ID), and no @@ -418,7 +453,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) if (txn->isolation != TXN_ISO_READ_UNCOMMITTED && !F_ISSET(txn, TXN_HAS_SNAPSHOT)) - __wt_txn_refresh(session, 1); + __wt_txn_get_snapshot(session); } /* diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index e50beac3bfe..4804290acba 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -945,14 +945,16 @@ struct __wt_session { * configured when the database is configured with "all"\, but the * cursor open will fail if "all" is specified when the database is * configured with "fast"\, and the cursor open will fail in all cases - * when the database is configured with "none". If \c statistics is not + * when the database is configured with "none". If "size" is + * configured\, only the underlying size of the object on disk is filled + * in and the object is not opened. If \c statistics is not * configured\, the default configuration is the database configuration. * The "clear" configuration resets statistics after gathering them\, * where appropriate (for example\, a cache size statistic is not * cleared\, while the count of cursor insert operations will be * cleared). See @ref statistics for more information., a list\, with * values chosen from the following options: \c "all"\, \c "fast"\, \c - * "clear"; default empty.} + * "clear"\, \c "size"; default empty.} * @config{target, if non-empty\, backup the list of objects; valid only * for a backup data source., a list of strings; default empty.} * @configend @@ -1335,6 +1337,10 @@ struct __wt_session { * @config{dump_shape, Display the shape of the tree after * verification\, using the application's message handler\, intended for * debugging., a boolean flag; default \c false.} + * @config{strict, Treat any verification problem as an error; by + * default\, verify will warn\, but not fail\, in the case of errors + * that won't affect future behavior (for example\, a leaked block)., a + * boolean flag; default \c false.} * @configend * @ebusy_errors */ diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index b63038b976e..5c1d76105cb 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -48,6 +48,20 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) conn = S2C(session); log = conn->log; + + /* + * We need to wait for the previous log file to get written + * to disk before we sync out the current one and advance + * the LSN. Signal the worker thread because we know the + * LSN has moved into a later log file and there should be a + * log file ready to close. + */ + while (log->sync_lsn.file < min_lsn->file) { + WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); + WT_ERR(__wt_cond_wait( + session, log->log_sync_cond, 10000)); + } + __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); /* @@ -1063,7 +1077,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * Signal the close thread if needed. */ if (F_ISSET(slot, SLOT_CLOSEFH)) - WT_ERR(__wt_cond_signal(session, conn->log_close_cond)); + WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); /* * Try to consolidate calls to fsync to wait less. Acquire a spin lock diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 7665e417722..111de7a2be1 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -20,11 +20,11 @@ static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t); static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *); /* - * __clsm_request_switch -- + * __wt_clsm_request_switch -- * Request an LSM tree switch for a cursor operation. */ -static inline int -__clsm_request_switch(WT_CURSOR_LSM *clsm) +int +__wt_clsm_request_switch(WT_CURSOR_LSM *clsm) { WT_DECL_RET; WT_LSM_TREE *lsm_tree; @@ -44,9 +44,9 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm) if (lsm_tree->nchunks == 0 || (clsm->dsk_gen == lsm_tree->dsk_gen && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))) { + F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); ret = __wt_lsm_manager_push_entry( session, WT_LSM_WORK_SWITCH, 0, lsm_tree); - F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); } WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); } @@ -55,6 +55,41 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm) } /* + * __wt_clsm_await_switch -- + * Wait for a switch to have completed in the LSM tree + */ +int +__wt_clsm_await_switch(WT_CURSOR_LSM *clsm) +{ + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + int waited; + + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + /* + * If there is no primary chunk, or a chunk has overflowed the hard + * limit, which either means a worker thread has fallen behind or there + * has just been a user-level checkpoint, wait until the tree changes. + * + * We used to switch chunks in the application thread here, but that is + * problematic because there is a transaction in progress and it could + * roll back, leaving the metadata inconsistent. + */ + for (waited = 0; + lsm_tree->nchunks == 0 || + clsm->dsk_gen == lsm_tree->dsk_gen; + ++waited) { + if (waited % 1000 == 0) + WT_RET(__wt_lsm_manager_push_entry( + session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); + __wt_sleep(0, 10); + } + return (0); +} + +/* * __clsm_enter_update -- * Make sure an LSM cursor is ready to perform an update. */ @@ -65,7 +100,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) WT_LSM_CHUNK *primary_chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; - int hard_limit, have_primary, ovfl, waited; + int hard_limit, have_primary, ovfl; lsm_tree = clsm->lsm_tree; ovfl = 0; @@ -108,30 +143,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) } /* Request a switch. */ - WT_RET(__clsm_request_switch(clsm)); + WT_RET(__wt_clsm_request_switch(clsm)); /* If we only overflowed the soft limit, we're done. */ if (have_primary && !hard_limit) return (0); - /* - * If there is no primary chunk, or it has overflowed the hard limit, - * which either means a worker thread has fallen behind or there has - * just been a user-level checkpoint, wait until the tree changes. - * - * We used to switch chunks in the application thread if we got to - * here, but that is problematic because there is a transaction in - * progress and it could roll back, leaving the metadata inconsistent. - */ - for (waited = 0; - lsm_tree->nchunks == 0 || - clsm->dsk_gen == lsm_tree->dsk_gen; - ++waited) { - if (waited % 1000 == 0) - WT_RET(__wt_lsm_manager_push_entry( - session, WT_LSM_WORK_SWITCH, 0, lsm_tree)); - __wt_sleep(0, 10); - } + WT_RET(__wt_clsm_await_switch(clsm)); return (0); } @@ -1423,11 +1441,11 @@ err: __clsm_leave(clsm); } /* - * __clsm_close -- + * __wt_clsm_close -- * WT_CURSOR->close method for the LSM cursor type. */ -static int -__clsm_close(WT_CURSOR *cursor) +int +__wt_clsm_close(WT_CURSOR *cursor) { WT_CURSOR_LSM *clsm; WT_DECL_RET; @@ -1481,14 +1499,17 @@ __wt_clsm_open(WT_SESSION_IMPL *session, __clsm_update, /* update */ __clsm_remove, /* remove */ __wt_cursor_reconfigure, /* reconfigure */ - __clsm_close); /* close */ + __wt_clsm_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LSM *clsm; WT_DECL_RET; WT_LSM_TREE *lsm_tree; + int bulk; + bulk = 0; clsm = NULL; cursor = NULL; + lsm_tree = NULL; if (!WT_PREFIX_MATCH(uri, "lsm:")) return (EINVAL); @@ -1498,9 +1519,22 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_RET_MSG(session, EINVAL, "LSM does not support opening by checkpoint"); + WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval)); + if (cval.val != 0) + bulk = 1; + /* Get the LSM tree. */ WT_WITH_DHANDLE_LOCK(session, - ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)); + ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree)); + /* + * Check whether the exclusive open for a bulk load succeeded, and + * if it did ensure that it's safe to bulk load into the tree. + */ + if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1))) + WT_ERR_MSG(session, EINVAL, + "bulk-load is only supported on newly created LSM trees"); + WT_ASSERT(session, !bulk || lsm_tree->exclusive); + /* Flag any errors from the tree get. */ WT_RET(ret); WT_ERR(__wt_calloc_one(session, &clsm)); @@ -1523,9 +1557,20 @@ __wt_clsm_open(WT_SESSION_IMPL *session, WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0); WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp)); + if (bulk) + WT_ERR(__wt_clsm_open_bulk(clsm, cfg)); + if (0) { err: if (clsm != NULL) - WT_TRET(__clsm_close(cursor)); + WT_TRET(__wt_clsm_close(cursor)); + else if (lsm_tree != NULL) + __wt_lsm_tree_release(session, lsm_tree); + + /* + * We open bulk cursors after setting the returned cursor. + * Fix that here. + */ + *cursorp = NULL; } return (ret); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c new file mode 100644 index 00000000000..8099c87c3bf --- /dev/null +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c @@ -0,0 +1,131 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __clsm_close_bulk -- + * WT_CURSOR->close method for LSM bulk cursors. + */ +static int +__clsm_close_bulk(WT_CURSOR *cursor) +{ + WT_CURSOR_LSM *clsm; + WT_CURSOR *bulk_cursor; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + + clsm = (WT_CURSOR_LSM *)cursor; + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + /* Close the bulk cursor to ensure the chunk is written to disk. */ + bulk_cursor = clsm->cursors[0]; + WT_RET(bulk_cursor->close(bulk_cursor)); + clsm->cursors[0] = NULL; + clsm->nchunks = 0; + + /* Set ondisk, and flush the metadata */ + F_SET(lsm_tree->chunk[0], WT_LSM_CHUNK_ONDISK); + WT_RET(__wt_lsm_meta_write(session, lsm_tree)); + ++lsm_tree->dsk_gen; + + /* Close the LSM cursor */ + WT_RET(__wt_clsm_close(cursor)); + + return (0); +} +/* + * __clsm_insert_bulk -- + * WT_CURSOR->insert method for LSM bulk cursors. + */ +static int +__clsm_insert_bulk(WT_CURSOR *cursor) +{ + WT_CURSOR *bulk_cursor; + WT_CURSOR_LSM *clsm; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + + clsm = (WT_CURSOR_LSM *)cursor; + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1); + ++lsm_tree->chunk[0]->count; + bulk_cursor = *clsm->cursors; + bulk_cursor->set_key(bulk_cursor, &cursor->key); + bulk_cursor->set_value(bulk_cursor, &cursor->value); + WT_RET(bulk_cursor->insert(bulk_cursor)); + + return (0); +} + +/* + * __wt_clsm_open_bulk -- + * WT_SESSION->open_cursor method for LSM bulk cursors. + */ +int +__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]) +{ + WT_CURSOR *cursor, *bulk_cursor; + WT_LSM_TREE *lsm_tree; + WT_SESSION_IMPL *session; + + bulk_cursor = NULL; + cursor = &clsm->iface; + lsm_tree = clsm->lsm_tree; + session = (WT_SESSION_IMPL *)clsm->iface.session; + + F_SET(clsm, WT_CLSM_BULK); + + /* Bulk cursors are limited to insert and close. */ + __wt_cursor_set_notsup(cursor); + cursor->insert = __clsm_insert_bulk; + cursor->close = __clsm_close_bulk; + + /* Setup the first chunk in the tree. */ + WT_RET(__wt_clsm_request_switch(clsm)); + WT_RET(__wt_clsm_await_switch(clsm)); + + /* + * Grab and release the LSM tree lock to ensure that the first chunk + * has been fully created before proceeding. We have the LSM tree + * open exclusive, so that saves us from needing the lock generally. + */ + WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); + WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); + + /* + * Open a bulk cursor on the first chunk, it's not a regular LSM chunk + * cursor, but use the standard storage locations. Allocate the space + * for a bloom filter - it makes cleanup simpler. Cleaned up by + * cursor close on error. + */ + WT_RET(__wt_calloc_one(session, &clsm->blooms)); + clsm->bloom_alloc = 1; + WT_RET(__wt_calloc_one(session, &clsm->cursors)); + clsm->cursor_alloc = 1; + clsm->nchunks = 1; + + /* + * Open a bulk cursor on the first chunk in the tree - take a read + * lock on the LSM tree while we are opening the chunk, to ensure + * that the first chunk has been fully created before we succeed. + * Pass through the application config to ensure the tree is open + * for bulk access. + */ + WT_RET(__wt_open_cursor(session, + lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor)); + clsm->cursors[0] = bulk_cursor; + /* LSM cursors are always raw */ + F_SET(bulk_cursor, WT_CURSTD_RAW); + + return (0); +} + diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c index e994300d4d3..656e43c978d 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c @@ -40,11 +40,12 @@ __curstat_lsm_init( /* Propagate all, fast and/or clear to the cursors we open. */ if (!F_ISSET(cst, WT_CONN_STAT_NONE)) { (void)snprintf(config, sizeof(config), - "statistics=(%s%s%s)", - F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "", + "statistics=(%s%s%s%s)", F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "", + F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "", !F_ISSET(cst, WT_CONN_STAT_ALL) && - F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : ""); + F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "", + F_ISSET(cst, WT_CONN_STAT_SIZE) ? "size," : ""); cfg[1] = disk_cfg[1] = config; } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 2bded10cb96..63f19858279 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -10,7 +10,8 @@ static int __lsm_tree_cleanup_old(WT_SESSION_IMPL *, const char *); static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *); -static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **); +static int __lsm_tree_open( + WT_SESSION_IMPL *, const char *, int, WT_LSM_TREE **); static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *); /* @@ -430,7 +431,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session, */ if (ret == 0) WT_WITH_DHANDLE_LOCK(session, - ret = __lsm_tree_open(session, uri, &lsm_tree)); + ret = __lsm_tree_open(session, uri, 1, &lsm_tree)); if (ret == 0) __wt_lsm_tree_release(session, lsm_tree); @@ -539,8 +540,8 @@ __lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Open an LSM tree structure. */ static int -__lsm_tree_open( - WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep) +__lsm_tree_open(WT_SESSION_IMPL *session, + const char *uri, int exclusive, WT_LSM_TREE **treep) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -556,7 +557,8 @@ __lsm_tree_open( WT_RET(__wt_lsm_manager_start(session)); /* Make sure no one beat us to it. */ - if ((ret = __lsm_tree_find(session, uri, 0, treep)) != WT_NOTFOUND) + if ((ret = __lsm_tree_find( + session, uri, exclusive, treep)) != WT_NOTFOUND) return (ret); /* Try to open the tree. */ @@ -582,6 +584,7 @@ __lsm_tree_open( * with getting handles exclusive. */ lsm_tree->refcnt = 1; + lsm_tree->exclusive = exclusive; lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ @@ -613,8 +616,9 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session, ret = __lsm_tree_find(session, uri, exclusive, treep); if (ret == WT_NOTFOUND) - ret = __lsm_tree_open(session, uri, treep); + ret = __lsm_tree_open(session, uri, exclusive, treep); + WT_ASSERT(session, ret != 0 || exclusive == (*treep)->exclusive); return (ret); } diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index 0566e0abc70..99140f89c51 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -281,7 +281,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, } /* Stop if a running transaction needs the chunk. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 62d4df47ff6..66e7e3977f4 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -17,6 +17,7 @@ typedef struct __wt_meta_track { enum { WT_ST_EMPTY, /* Unused slot */ WT_ST_CHECKPOINT, /* Complete a checkpoint */ + WT_ST_DROP_COMMIT, /* Drop post commit */ WT_ST_FILEOP, /* File operation */ WT_ST_LOCK, /* Lock a handle */ WT_ST_REMOVE, /* Remove a metadata entry */ @@ -106,7 +107,8 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) * Unlock handles and complete checkpoints regardless of whether we are * unrolling. */ - if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK) + if (!unroll && trk->op != WT_ST_CHECKPOINT && + trk->op != WT_ST_DROP_COMMIT && trk->op != WT_ST_LOCK) goto free; switch (trk->op) { @@ -120,6 +122,14 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) WT_TRET(bm->checkpoint_resolve(bm, session))); } break; + case WT_ST_DROP_COMMIT: + if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) { + __wt_err(session, tret, + "metadata remove dropped file %s", + trk->a); + WT_TRET(tret); + } + break; case WT_ST_LOCK: /* Handle lock, see above */ if (unroll && trk->created) F_SET(trk->dhandle, WT_DHANDLE_DISCARD); @@ -394,6 +404,23 @@ __wt_meta_track_fileop( } /* + * __wt_meta_track_drop -- + * Track a file drop, where the remove is deferred until commit. + */ +int +__wt_meta_track_drop( + WT_SESSION_IMPL *session, const char *filename) +{ + WT_META_TRACK *trk; + + WT_RET(__meta_track_next(session, &trk)); + + trk->op = WT_ST_DROP_COMMIT; + WT_RET(__wt_strdup(session, filename, &trk->a)); + return (0); +} + +/* * __wt_meta_track_handle_lock -- * Track a locked handle. */ diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 573ea8811f8..14ab05fbb25 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -363,6 +363,19 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); } +#ifdef HAVE_DIAGNOSTIC + { + /* + * Check that transaction time always moves forward for a given page. + * If this check fails, reconciliation can free something that a future + * reconciliation will need. + */ + uint64_t oldest_id = __wt_txn_oldest_id(session); + WT_ASSERT(session, TXNID_LE(mod->last_oldest_id, oldest_id)); + mod->last_oldest_id = oldest_id; + } +#endif + /* Record the most recent transaction ID we will *not* write. */ mod->disk_snap_min = session->txn.snap_min; @@ -839,6 +852,7 @@ static inline int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_DECL_RET; WT_ITEM ovfl; WT_PAGE *page; WT_UPDATE *upd, *upd_list, *upd_ovfl; @@ -977,8 +991,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && !__wt_txn_visible_all(session, min_txn)) { - WT_RET(__wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)); + if ((ret = __wt_ovfl_txnc_search( + page, vpack->data, vpack->size, &ovfl)) != 0) + WT_PANIC_RET(session, ret, + "cached overflow item discarded early"); + /* * Create an update structure with an impossibly low transaction * ID and append it to the update list we're about to save. @@ -1221,10 +1238,6 @@ __rec_child_deleted( if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) WT_PANIC_RET(session, EINVAL, "reconciliation illegally skipped an update"); - - /* If this page cannot be evicted, quit now. */ - if (F_ISSET(r, WT_EVICTING)) - return (EBUSY); } /* @@ -1265,6 +1278,18 @@ __rec_child_deleted( } /* + * If there are deleted child pages that we can't discard immediately, + * keep the page dirty so they are eventually freed. + */ + if (ref->addr != NULL) { + r->leave_dirty = 1; + + /* This page cannot be evicted, quit now. */ + if (F_ISSET(r, WT_EVICTING)) + return (EBUSY); + } + + /* * Minor memory cleanup: if a truncate call deleted this page and we * were ever forced to instantiate the page in memory, we would have * built a list of updates in the page reference in order to be able diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c index 03097128ec2..694d07c65bf 100644 --- a/src/third_party/wiredtiger/src/schema/schema_drop.c +++ b/src/third_party/wiredtiger/src/schema/schema_drop.c @@ -39,10 +39,10 @@ __drop_file( return (ret); /* - * Remove the underlying physical file. There is no point tracking this - * operation: there is no going back from here. + * Schedule the remove of the underlying physical file when the drop + * completes. */ - WT_TRET(__wt_remove_if_exists(session, filename)); + WT_TRET(__wt_meta_track_drop(session, filename)); return (ret); } @@ -120,8 +120,13 @@ __drop_table( for (i = 0; i < WT_COLGROUPS(table); i++) { if ((colgroup = table->cgroups[i]) == NULL) continue; - WT_ERR(__wt_metadata_remove(session, colgroup->name)); + /* + * Drop the column group before updating the metadata to avoid + * the metadata for the table becoming inconsistent if we can't + * get exclusive access. + */ WT_ERR(__wt_schema_drop(session, colgroup->source, cfg)); + WT_ERR(__wt_metadata_remove(session, colgroup->name)); } /* Drop the indices. */ @@ -129,8 +134,13 @@ __drop_table( for (i = 0; i < table->nindices; i++) { if ((idx = table->indices[i]) == NULL) continue; - WT_ERR(__wt_metadata_remove(session, idx->name)); + /* + * Drop the column group before updating the metadata to avoid + * the metadata for the table becoming inconsistent if we can't + * get exclusive access. + */ WT_ERR(__wt_schema_drop(session, idx->source, cfg)); + WT_ERR(__wt_metadata_remove(session, idx->name)); } WT_ERR(__wt_schema_remove_table(session, table)); diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c index 51281eccec5..c00ffa7d61c 100644 --- a/src/third_party/wiredtiger/src/schema/schema_rename.c +++ b/src/third_party/wiredtiger/src/schema/schema_rename.c @@ -155,15 +155,18 @@ __rename_tree(WT_SESSION_IMPL *session, cval.str + cval.len)); /* + * Do the rename before updating the metadata to avoid leaving the + * metadata inconsistent if the rename fails. + */ + WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg)); + + /* * Remove the old metadata entry. * Insert the new metadata entry. */ WT_ERR(__wt_metadata_remove(session, name)); WT_ERR(__wt_metadata_insert(session, nn->data, nv->data)); - /* Rename the file. */ - WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg)); - err: __wt_scr_free(session, &nn); __wt_scr_free(session, &ns); __wt_scr_free(session, &nv); diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index c838785a9c3..f6f5a695b4f 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -57,66 +57,44 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn = &session->txn; txn_state = &S2C(session)->txn_global.states[session->id]; - if (txn_state->snap_min != WT_TXN_NONE) { - WT_ASSERT(session, - session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->snap_min)); - txn_state->snap_min = WT_TXN_NONE; - } - F_CLR(txn, TXN_HAS_SNAPSHOT); -} + WT_ASSERT(session, + txn_state->snap_min == WT_TXN_NONE || + session->txn.isolation == TXN_ISO_READ_UNCOMMITTED || + !__wt_txn_visible_all(session, txn_state->snap_min)); -/* - * __wt_txn_update_oldest -- - * Sweep the running transactions to update the oldest ID required. - */ -void -__wt_txn_update_oldest(WT_SESSION_IMPL *session) -{ - /* - * !!! - * If a data-source is calling the WT_EXTENSION_API.transaction_oldest - * method (for the oldest transaction ID not yet visible to a running - * transaction), and then comparing that oldest ID against committed - * transactions to see if updates for a committed transaction are still - * visible to running transactions, the oldest transaction ID may be - * the same as the last committed transaction ID, if the transaction - * state wasn't refreshed after the last transaction committed. Push - * past the last committed transaction. - */ - __wt_txn_refresh(session, 0); + txn_state->snap_min = WT_TXN_NONE; + F_CLR(txn, TXN_HAS_SNAPSHOT); } /* - * __wt_txn_refresh -- - * Allocate a transaction ID and/or a snapshot. + * __wt_txn_get_snapshot -- + * Allocate a snapshot. */ void -__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) +__wt_txn_get_snapshot(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; - uint64_t current_id, id, oldest_id; + uint64_t current_id, id; uint64_t prev_oldest_id, snap_min; - uint32_t i, n, oldest_session, session_cnt; + uint32_t i, n, session_cnt; int32_t count; conn = S2C(session); txn = &session->txn; txn_global = &conn->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - if (get_snapshot) { - txn_state->snap_min = current_id; - __txn_sort_snapshot(session, 0, current_id); - } + txn_state->snap_min = current_id; + __txn_sort_snapshot(session, 0, current_id); + /* Check that the oldest ID has not moved in the meantime. */ if (prev_oldest_id == txn_global->oldest_id && txn_global->scan_count == 0) @@ -136,17 +114,11 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; - current_id = oldest_id = snap_min = txn_global->current; - oldest_session = 0; + current_id = snap_min = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* Skip the checkpoint transaction; it is never read from. */ - if (txn_global->checkpoint_id != WT_TXN_NONE && - s->id == txn_global->checkpoint_id) - continue; - /* * Build our snapshot of any concurrent transaction IDs. * @@ -160,18 +132,99 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) if (s != txn_state && (id = s->id) != WT_TXN_NONE && TXNID_LE(prev_oldest_id, id)) { - if (get_snapshot) - txn->snapshot[n++] = id; + txn->snapshot[n++] = id; if (TXNID_LT(id, snap_min)) snap_min = id; } + } + + /* + * If we got a new snapshot, update the published snap_min for this + * session. + */ + WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min)); + WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); + txn_state->snap_min = snap_min; + + /* Update the last running ID if we have a much newer value. */ + if (snap_min > txn_global->last_running + 100) + txn_global->last_running = snap_min; + + WT_ASSERT(session, txn_global->scan_count > 0); + (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + + __txn_sort_snapshot(session, n, current_id); +} + +/* + * __wt_txn_update_oldest -- + * Sweep the running transactions to update the oldest ID required. + * !!! + * If a data-source is calling the WT_EXTENSION_API.transaction_oldest + * method (for the oldest transaction ID not yet visible to a running + * transaction), and then comparing that oldest ID against committed + * transactions to see if updates for a committed transaction are still + * visible to running transactions, the oldest transaction ID may be + * the same as the last committed transaction ID, if the transaction + * state wasn't refreshed after the last transaction committed. Push + * past the last committed transaction. +*/ +void +__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) +{ + WT_CONNECTION_IMPL *conn; + WT_SESSION_IMPL *oldest_session; + WT_TXN_GLOBAL *txn_global; + WT_TXN_STATE *s; + uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min; + uint32_t i, session_cnt; + int32_t count; + int last_running_moved; + + conn = S2C(session); + txn_global = &conn->txn_global; + current_id = snap_min = txn_global->current; + oldest_session = NULL; + prev_oldest_id = txn_global->oldest_id; + + /* + * For pure read-only workloads, or if the update isn't forced and the + * oldest ID isn't too far behind, avoid scanning. + */ + if (prev_oldest_id == current_id || + (!force && TXNID_LT(current_id, prev_oldest_id + 100))) + return; + + /* + * We're going to scan. Increment the count of scanners to prevent the + * oldest ID from moving forwards. Spin if the count is negative, + * which indicates that some thread is moving the oldest ID forwards. + */ + do { + if ((count = txn_global->scan_count) < 0) + WT_PAUSE(); + } while (count < 0 || + !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); + + /* The oldest ID cannot change until the scan count goes to zero. */ + prev_oldest_id = txn_global->oldest_id; + current_id = oldest_id = snap_min = txn_global->current; + + /* Walk the array of concurrent transactions. */ + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { /* - * Ignore the session's own snap_min: we are about to update - * it. + * Update the oldest ID. + * + * Ignore: IDs older than the oldest ID we saw. This can happen + * if we race with a thread that is allocating an ID -- the ID + * will not be used because the thread will keep spinning until + * it gets a valid one. */ - if (get_snapshot && s == txn_state) - continue; + if ((id = s->id) != WT_TXN_NONE && + TXNID_LE(prev_oldest_id, id) && TXNID_LT(id, snap_min)) + snap_min = id; /* * !!! @@ -184,51 +237,25 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) if ((id = s->snap_min) != WT_TXN_NONE && TXNID_LT(id, oldest_id)) { oldest_id = id; - oldest_session = i; + oldest_session = &conn->sessions[i]; } } if (TXNID_LT(snap_min, oldest_id)) oldest_id = snap_min; - if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id)) - oldest_id = txn->id; - /* - * If we got a new snapshot, update the published snap_min for this - * session. - */ - if (get_snapshot) { - WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min)); - WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->snap_min = snap_min; - } - - /* - * Update the last running ID if we have a much newer value or we are - * forcing an update. - */ - if (!get_snapshot || snap_min > txn_global->last_running + 100) + /* Update the last running ID. */ + if (TXNID_LT(txn_global->last_running, snap_min)) { txn_global->last_running = snap_min; + last_running_moved = 1; + } else + last_running_moved = 0; - /* - * Update the oldest ID if we have a newer ID and we can get exclusive - * access. During normal snapshot refresh, only do this if we have a - * much newer value. Once we get exclusive access, do another pass to - * make sure nobody else is using an earlier ID. - */ + /* Update the oldest ID. */ if (TXNID_LT(prev_oldest_id, oldest_id) && - (!get_snapshot || oldest_id - prev_oldest_id > 100) && WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { - /* - * Skip the checkpoint transaction; it is never read - * from. - */ - if (txn_global->checkpoint_id != WT_TXN_NONE && - s->id == txn_global->checkpoint_id) - continue; - if ((id = s->id) != WT_TXN_NONE && TXNID_LT(id, oldest_id)) oldest_id = id; @@ -241,31 +268,27 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot) txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && - current_id - oldest_id > 10000 && - txn_global->oldest_session != oldest_session) { + current_id - oldest_id > 10000 && last_running_moved && + oldest_session != NULL) { (void)__wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %d [%s]" " with snap_min %" PRIu64 "\n", - oldest_id, oldest_session, - conn->sessions[oldest_session].lastop, - conn->sessions[oldest_session].txn.snap_min); - txn_global->oldest_session = oldest_session; + oldest_id, oldest_session->id, + oldest_session->lastop, + oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); } - - if (get_snapshot) - __txn_sort_snapshot(session, n, current_id); } /* - * __wt_txn_begin -- - * Begin a transaction. + * __wt_txn_config -- + * Configure a transaction. */ int -__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) +__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_TXN *txn; @@ -273,9 +296,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) txn = &session->txn; WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval)); - if (cval.len == 0) - txn->isolation = session->isolation; - else + if (cval.len != 0) txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? TXN_ISO_SNAPSHOT : @@ -294,18 +315,11 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) * !!! This is an unusual use of the config code: the "default" value * we pass in is inherited from the connection. */ - txn->txn_logsync = S2C(session)->txn_logsync; WT_RET(__wt_config_gets_def(session, cfg, "sync", FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) ? 1 : 0, &cval)); if (!cval.val) txn->txn_logsync = 0; - F_SET(txn, TXN_RUNNING); - if (txn->isolation == TXN_ISO_SNAPSHOT) { - if (session->ncursors > 0) - WT_RET(__wt_session_copy_values(session)); - __wt_txn_refresh(session, 1); - } return (0); } @@ -325,10 +339,17 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn->notify = NULL; txn_global = &S2C(session)->txn_global; - txn_state = &txn_global->states[session->id]; + txn_state = WT_SESSION_TXN_STATE(session); /* Clear the transaction's ID from the global table. */ - if (F_ISSET(txn, TXN_HAS_ID)) { + if (WT_SESSION_IS_CHECKPOINT(session)) { + WT_ASSERT(session, txn_state->id == WT_TXN_NONE); + txn->id = WT_TXN_NONE; + + /* Clear the global checkpoint transaction IDs. */ + txn_global->checkpoint_id = 0; + txn_global->checkpoint_pinned = WT_TXN_NONE; + } else if (F_ISSET(txn, TXN_HAS_ID)) { WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); @@ -385,6 +406,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_txn_release_snapshot(session); ret = __wt_txn_log_commit(session, cfg); + WT_ASSERT(session, ret == 0); } /* @@ -515,19 +537,19 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_CONNECTION_IMPL *conn; WT_CONNECTION_STATS *stats; - uint64_t checkpoint_snap_min; + uint64_t checkpoint_pinned; conn = S2C(session); txn_global = &conn->txn_global; stats = &conn->stats; - checkpoint_snap_min = txn_global->checkpoint_snap_min; + checkpoint_pinned = txn_global->checkpoint_pinned; WT_STAT_SET(stats, txn_pinned_range, txn_global->current - txn_global->oldest_id); WT_STAT_SET(stats, txn_pinned_checkpoint_range, - checkpoint_snap_min == WT_TXN_NONE ? - 0 : txn_global->current - checkpoint_snap_min); + checkpoint_pinned == WT_TXN_NONE ? + 0 : txn_global->current - checkpoint_pinned); } /* diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 1361b1a6682..08d8b778371 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -349,6 +349,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; + WT_TXN_STATE *txn_state; const char *txn_cfg[] = { WT_CONFIG_BASE(session, session_begin_transaction), "isolation=snapshot", NULL }; @@ -358,6 +359,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); txn_global = &conn->txn_global; + txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; txn = &session->txn; full = idle = logging = tracking = 0; @@ -388,7 +390,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * This is particularly important for compact, so that all dirty pages * can be fully written. */ - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); /* Flush data-sources before we start the checkpoint. */ WT_ERR(__checkpoint_data_source(session, cfg)); @@ -426,6 +428,22 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_verbose_track(session, "starting transaction", &verb_timer)); + if (full) + WT_ERR(__wt_epoch(session, &start)); + + /* + * Bump the global checkpoint generation, used to figure out whether + * checkpoint has visited a tree. There is no need for this to be + * atomic: it is only written while holding the checkpoint lock. + * + * We do need to update it before clearing the checkpoint's entry out + * of the transaction table, or a thread evicting in a tree could + * ignore the checkpoint's transaction. + */ + ++txn_global->checkpoint_gen; + WT_STAT_FAST_CONN_SET(session, + txn_checkpoint_generation, txn_global->checkpoint_gen); + /* * Start a snapshot transaction for the checkpoint. * @@ -433,27 +451,44 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * side effects on cursors, which applications can hold open across * calls to checkpoint. */ - if (full) - WT_ERR(__wt_epoch(session, &start)); WT_ERR(__wt_txn_begin(session, txn_cfg)); /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_ERR(__wt_txn_id_check(session)); + /* - * Save a copy of the checkpoint transaction ID so that refresh can - * skip the checkpoint IDs. Save a copy of the snap min so that - * visibility checks for the checkpoint use the right ID. + * Save the checkpoint session ID. We never do checkpoints in the + * default session (with id zero). */ - txn_global->checkpoint_id = session->txn.id; - txn_global->checkpoint_snap_min = session->txn.snap_min; + WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); + txn_global->checkpoint_id = session->id; + + txn_global->checkpoint_pinned = + WT_MIN(txn_state->id, txn_state->snap_min); /* - * No need for this to be atomic it is only written while holding the - * checkpoint lock. + * We're about to clear the checkpoint transaction from the global + * state table so the oldest ID can move forward. Make sure everything + * we've done above is scheduled. */ - txn_global->checkpoint_gen += 1; - WT_STAT_FAST_CONN_SET(session, - txn_checkpoint_generation, txn_global->checkpoint_gen); + WT_FULL_BARRIER(); + + /* + * Sanity check that the oldest ID hasn't moved on before we have + * cleared our entry. + */ + WT_ASSERT(session, + TXNID_LE(txn_global->oldest_id, txn_state->id) && + TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + + /* + * Clear our entry from the global transaction session table. Any + * operation that needs to know about the ID for this checkpoint will + * consider the checkpoint ID in the global structure. Most operations + * can safely ignore the checkpoint ID (see the visible all check for + * details). + */ + txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) { @@ -474,10 +509,6 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* Release the snapshot so we aren't pinning pages in cache. */ __wt_txn_release_snapshot(session); - /* Clear the global checkpoint transaction IDs */ - txn_global->checkpoint_id = WT_TXN_NONE; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - WT_ERR(__checkpoint_verbose_track(session, "committing transaction", &verb_timer)); @@ -550,10 +581,6 @@ err: /* WT_TRET(__wt_txn_rollback(session, NULL)); } - /* Ensure the checkpoint IDs are cleared on the error path. */ - txn_global->checkpoint_id = WT_TXN_NONE; - txn_global->checkpoint_snap_min = WT_TXN_NONE; - /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. @@ -806,10 +833,8 @@ __checkpoint_worker( force = 1; } if (!btree->modified && !force) { - if (!is_checkpoint) { - F_SET(btree, WT_BTREE_SKIP_CKPT); - goto done; - } + if (!is_checkpoint) + goto nockpt; deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) @@ -828,7 +853,12 @@ __checkpoint_worker( (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && deleted < 2) { - F_SET(btree, WT_BTREE_SKIP_CKPT); +nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); + WT_PUBLISH(btree->checkpoint_gen, + S2C(session)->txn_global.checkpoint_gen); + WT_STAT_FAST_DATA_SET(session, + btree_checkpoint_generation, + btree->checkpoint_gen); goto done; } } @@ -1056,16 +1086,8 @@ fake: /* WT_ERR(__wt_txn_checkpoint_log( session, 0, WT_TXN_LOG_CKPT_STOP, NULL)); - /* - * Update the checkpoint generation for this handle so visible - * updates newer than the checkpoint can be evicted. - */ -done: btree->checkpoint_gen = conn->txn_global.checkpoint_gen; - WT_STAT_FAST_DATA_SET(session, - btree_checkpoint_generation, btree->checkpoint_gen); - -err: - /* +done: +err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ @@ -1142,7 +1164,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) * for active readers. */ if (!btree->modified && !bulk) { - __wt_txn_update_oldest(session); + __wt_txn_update_oldest(session, 1); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY); } |