summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alexander Gorrod <alexander.gorrod@mongodb.com> 2015-07-06 00:17:49 +0000
committer: Alexander Gorrod <alexander.gorrod@mongodb.com> 2015-07-06 00:17:49 +0000
commit: f31038b98941bdc72c13449183854a690fd20653 (patch)
tree: b64ad23009156438cc8a58e994a4de1f63a65ff9
parent: db0ba62bd4a375f86e36c992033894569233000f (diff)
download: mongo-f31038b98941bdc72c13449183854a690fd20653.tar.gz
Import wiredtiger-wiredtiger-mongodb-3.0.4-20-ga3b359d.tar.gz from wiredtiger branch mongodb-3.0
-rw-r--r-- src/third_party/wiredtiger/build_win/filelist.win 1
-rw-r--r-- src/third_party/wiredtiger/dist/api_data.py 37
-rw-r--r-- src/third_party/wiredtiger/dist/filelist 1
-rw-r--r-- src/third_party/wiredtiger/src/block/block_mgr.c 5
-rw-r--r-- src/third_party/wiredtiger/src/block/block_open.c 18
-rw-r--r-- src/third_party/wiredtiger/src/block/block_vrfy.c 19
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_page.c 5
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_split.c 14
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_sync.c 14
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_vrfy.c 4
-rw-r--r-- src/third_party/wiredtiger/src/config/config_def.c 5
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_log.c 38
-rw-r--r-- src/third_party/wiredtiger/src/conn/conn_open.c 2
-rw-r--r-- src/third_party/wiredtiger/src/cursor/cur_stat.c 32
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_file.c 2
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_lru.c 36
-rw-r--r-- src/third_party/wiredtiger/src/evict/evict_page.c 3
-rw-r--r-- src/third_party/wiredtiger/src/include/block.h 4
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 12
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.h 2
-rw-r--r-- src/third_party/wiredtiger/src/include/cache.i 19
-rw-r--r-- src/third_party/wiredtiger/src/include/connection.h 9
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h 14
-rw-r--r-- src/third_party/wiredtiger/src/include/lsm.h 17
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.h 20
-rw-r--r-- src/third_party/wiredtiger/src/include/txn.i 91
-rw-r--r-- src/third_party/wiredtiger/src/include/wiredtiger.in 10
-rw-r--r-- src/third_party/wiredtiger/src/log/log.c 16
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_cursor.c 105
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c 131
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_stat.c 7
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_tree.c 16
-rw-r--r-- src/third_party/wiredtiger/src/lsm/lsm_work_unit.c 2
-rw-r--r-- src/third_party/wiredtiger/src/meta/meta_track.c 29
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 37
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_drop.c 20
-rw-r--r-- src/third_party/wiredtiger/src/schema/schema_rename.c 9
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn.c 244
-rw-r--r-- src/third_party/wiredtiger/src/txn/txn_ckpt.c 96
39 files changed, 798 insertions, 348 deletions
diff --git a/src/third_party/wiredtiger/build_win/filelist.win b/src/third_party/wiredtiger/build_win/filelist.win
index e297ca16b06..8655c0eda8e 100644
--- a/src/third_party/wiredtiger/build_win/filelist.win
+++ b/src/third_party/wiredtiger/build_win/filelist.win
@@ -85,6 +85,7 @@ src/log/log.c
src/log/log_auto.c
src/log/log_slot.c
src/lsm/lsm_cursor.c
+src/lsm/lsm_cursor_bulk.c
src/lsm/lsm_manager.c
src/lsm/lsm_merge.c
src/lsm/lsm_meta.c
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py
index 351067d7ba5..5ad422befb4 100644
--- a/src/third_party/wiredtiger/dist/api_data.py
+++ b/src/third_party/wiredtiger/dist/api_data.py
@@ -715,22 +715,22 @@ methods = {
type='boolean', undoc=True),
Config('statistics', '', r'''
Specify the statistics to be gathered. Choosing "all" gathers
- statistics regardless of cost and may include traversing
- on-disk files; "fast" gathers a subset of relatively
- inexpensive statistics. The selection must agree with the
- database \c statistics configuration specified to
- ::wiredtiger_open or WT_CONNECTION::reconfigure. For example,
- "all" or "fast" can be configured when the database is
- configured with "all", but the cursor open will fail if "all"
- is specified when the database is configured with "fast",
- and the cursor open will fail in all cases when the database
- is configured with "none". If \c statistics is not configured,
- the default configuration is the database configuration.
- The "clear" configuration resets statistics after gathering
- them, where appropriate (for example, a cache size statistic
- is not cleared, while the count of cursor insert operations
- will be cleared). See @ref statistics for more information''',
- type='list', choices=['all', 'fast', 'clear']),
+ statistics regardless of cost and may include traversing on-disk files;
+ "fast" gathers a subset of relatively inexpensive statistics. The
+ selection must agree with the database \c statistics configuration
+ specified to ::wiredtiger_open or WT_CONNECTION::reconfigure. For
+ example, "all" or "fast" can be configured when the database is
+ configured with "all", but the cursor open will fail if "all" is
+ specified when the database is configured with "fast", and the cursor
+ open will fail in all cases when the database is configured with
+ "none". If "size" is configured, only the underlying size of the
+ object on disk is filled in and the object is not opened. If \c
+ statistics is not configured, the default configuration is the database
+ configuration. The "clear" configuration resets statistics after
+ gathering them, where appropriate (for example, a cache size statistic
+ is not cleared, while the count of cursor insert operations will be
+ cleared). See @ref statistics for more information''',
+ type='list', choices=['all', 'fast', 'clear', 'size']),
Config('target', '', r'''
if non-empty, backup the list of objects; valid only for a
backup data source''',
@@ -767,6 +767,11 @@ methods = {
Config('dump_shape', 'false', r'''
Display the shape of the tree after verification,
using the application's message handler, intended for debugging''',
+ type='boolean'),
+ Config('strict', 'false', r'''
+ Treat any verification problem as an error; by default, verify will
+ warn, but not fail, in the case of errors that won't affect future
+ behavior (for example, a leaked block)''',
type='boolean')
]),
diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist
index ee70ccf765e..af72bab6718 100644
--- a/src/third_party/wiredtiger/dist/filelist
+++ b/src/third_party/wiredtiger/dist/filelist
@@ -85,6 +85,7 @@ src/log/log.c
src/log/log_auto.c
src/log/log_slot.c
src/lsm/lsm_cursor.c
+src/lsm/lsm_cursor_bulk.c
src/lsm/lsm_manager.c
src/lsm/lsm_merge.c
src/lsm/lsm_meta.c
diff --git a/src/third_party/wiredtiger/src/block/block_mgr.c b/src/third_party/wiredtiger/src/block/block_mgr.c
index 13e6ec73b32..558008ee7b0 100644
--- a/src/third_party/wiredtiger/src/block/block_mgr.c
+++ b/src/third_party/wiredtiger/src/block/block_mgr.c
@@ -302,9 +302,10 @@ __bm_salvage_end(WT_BM *bm, WT_SESSION_IMPL *session)
* Start a block manager verify.
*/
static int
-__bm_verify_start(WT_BM *bm, WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
+__bm_verify_start(WT_BM *bm,
+ WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
{
- return (__wt_block_verify_start(session, bm->block, ckptbase));
+ return (__wt_block_verify_start(session, bm->block, ckptbase, cfg));
}
/*
diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c
index 5a882f0fb7c..8e45ec85a97 100644
--- a/src/third_party/wiredtiger/src/block/block_open.c
+++ b/src/third_party/wiredtiger/src/block/block_open.c
@@ -388,7 +388,7 @@ err: __wt_scr_free(session, &buf);
/*
* __wt_block_stat --
- * Block statistics
+ * Set the statistics for a live block handle.
*/
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
@@ -409,3 +409,19 @@ __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
WT_STAT_SET(stats, block_size, block->fh->size);
__wt_spin_unlock(session, &block->live_lock);
}
+
+/*
+ * __wt_block_manager_size --
+ * Set the size statistic for a file.
+ */
+int
+__wt_block_manager_size(
+ WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats)
+{
+ wt_off_t filesize;
+
+ WT_RET(__wt_filesize_name(session, filename, &filesize));
+ WT_STAT_SET(stats, block_size, filesize);
+
+ return (0);
+}
diff --git a/src/third_party/wiredtiger/src/block/block_vrfy.c b/src/third_party/wiredtiger/src/block/block_vrfy.c
index 1e341aff77a..29a9e4950b4 100644
--- a/src/third_party/wiredtiger/src/block/block_vrfy.c
+++ b/src/third_party/wiredtiger/src/block/block_vrfy.c
@@ -28,10 +28,11 @@ static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
* Start file verification.
*/
int
-__wt_block_verify_start(
- WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
+__wt_block_verify_start(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[])
{
WT_CKPT *ckpt;
+ WT_CONFIG_ITEM cval;
wt_off_t size;
/*
@@ -98,6 +99,10 @@ __wt_block_verify_start(
*/
WT_RET(__verify_last_avail(session, block, ckpt));
+ /* Configuration: strict behavior on any error. */
+ WT_RET(__wt_config_gets(session, cfg, "strict", &cval));
+ block->verify_strict = cval.val ? 1 : 0;
+
block->verify = 1;
return (0);
}
@@ -164,14 +169,18 @@ __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
/* Confirm we verified every file block. */
ret = __verify_filefrag_chk(session, block);
+ block->verify = 0;
+ block->verify_strict = 0;
+ block->verify_size = 0;
+
/* Discard the accumulated allocation list. */
__wt_block_extlist_free(session, &block->verify_alloc);
/* Discard the fragment tracking lists. */
+ block->frags = 0;
__wt_free(session, block->fragfile);
__wt_free(session, block->fragckpt);
- block->verify = 0;
return (ret);
}
@@ -434,7 +443,7 @@ __verify_filefrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
return (0);
__wt_errx(session, "file ranges never verified: %" PRIu64, count);
- return (WT_ERROR);
+ return (block->verify_strict ? WT_ERROR : 0);
}
/*
@@ -527,5 +536,5 @@ __verify_ckptfrag_chk(WT_SESSION_IMPL *session, WT_BLOCK *block)
__wt_errx(session,
"checkpoint ranges never verified: %" PRIu64, count);
- return (WT_ERROR);
+ return (block->verify_strict ? WT_ERROR : 0);
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c
index 0b93cc981d7..120220223f8 100644
--- a/src/third_party/wiredtiger/src/btree/bt_page.c
+++ b/src/third_party/wiredtiger/src/btree/bt_page.c
@@ -49,6 +49,9 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
/* Trigger eviction on the next page release. */
__wt_page_evict_soon(page);
+ /* Bump the oldest ID, we're about to do some visibility checks. */
+ __wt_txn_update_oldest(session, 0);
+
/* If eviction cannot succeed, don't try. */
return (__wt_page_can_evict(session, page, 1));
}
@@ -168,7 +171,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
page->read_gen != WT_READGEN_OLDEST &&
page->read_gen < __wt_cache_read_gen(session))
page->read_gen =
- __wt_cache_read_gen_set(session);
+ __wt_cache_read_gen_bump(session);
return (0);
WT_ILLEGAL_VALUE(session);
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index f5c3d5fa331..eb2382cd610 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1028,20 +1028,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* are holding it locked.
*/
if (ret == 0 && !exclusive &&
- !F_ISSET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN) &&
- __split_should_deepen(session, parent_ref, &children)) {
- /*
- * XXX
- * Temporary hack to avoid a bug where the root page is split
- * even when it's no longer doing any good.
- */
- uint64_t __a, __b;
- __a = parent->memory_footprint;
+ __split_should_deepen(session, parent_ref, &children))
ret = __split_deepen(session, parent, children);
- __b = parent->memory_footprint;
- if (__b * 2 >= __a)
- F_SET_ATOMIC(parent, WT_PAGE_REFUSE_DEEPEN);
- }
err: if (!complete)
for (i = 0; i < parent_entries; ++i) {
diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c
index 71b0d0abdb3..ca3b8f327b3 100644
--- a/src/third_party/wiredtiger/src/btree/bt_sync.c
+++ b/src/third_party/wiredtiger/src/btree/bt_sync.c
@@ -71,7 +71,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop)
__wt_txn_visible_all(
session, page->modify->update_txn)) {
if (txn->isolation == TXN_ISO_READ_COMMITTED)
- __wt_txn_refresh(session, 1);
+ __wt_txn_get_snapshot(session);
leaf_bytes += page->memory_footprint;
++leaf_pages;
WT_ERR(__wt_reconcile(session, walk, NULL, 0));
@@ -190,6 +190,18 @@ err: /* On error, clear any left-over tree walk. */
if (btree->checkpointing) {
/*
+ * Update the checkpoint generation for this handle so visible
+ * updates newer than the checkpoint can be evicted.
+ *
+ * This has to be published before eviction is enabled again,
+ * so that eviction knows that the checkpoint has completed.
+ */
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_FAST_DATA_SET(session,
+ btree_checkpoint_generation, btree->checkpoint_gen);
+
+ /*
* Clear the checkpoint flag and push the change; not required,
* but publishing the change means stalled eviction gets moving
* as soon as possible.
diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
index 45c2029f6ed..93d1ddad8c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c
+++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c
@@ -23,7 +23,7 @@ typedef struct {
#define WT_VRFY_DUMP(vs) \
((vs)->dump_address || \
(vs)->dump_blocks || (vs)->dump_pages || (vs)->dump_shape)
- int dump_address; /* Debugging hooks */
+ int dump_address; /* Configure: dump special */
int dump_blocks;
int dump_pages;
int dump_shape;
@@ -176,7 +176,7 @@ __wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
__wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));
/* Inform the underlying block manager we're verifying. */
- WT_ERR(bm->verify_start(bm, session, ckptbase));
+ WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
bm_start = 1;
/* Loop through the file's checkpoints, verifying each one. */
diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c
index e2990f26719..d068c196771 100644
--- a/src/third_party/wiredtiger/src/config/config_def.c
+++ b/src/third_party/wiredtiger/src/config/config_def.c
@@ -312,7 +312,7 @@ static const WT_CONFIG_CHECK confchk_session_open_cursor[] = {
{ "readonly", "boolean", NULL, NULL, NULL },
{ "skip_sort_check", "boolean", NULL, NULL, NULL },
{ "statistics", "list",
- NULL, "choices=[\"all\",\"fast\",\"clear\"]",
+ NULL, "choices=[\"all\",\"fast\",\"clear\",\"size\"]",
NULL },
{ "target", "list", NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL, NULL }
@@ -337,6 +337,7 @@ static const WT_CONFIG_CHECK confchk_session_verify[] = {
{ "dump_offsets", "list", NULL, NULL, NULL },
{ "dump_pages", "boolean", NULL, NULL, NULL },
{ "dump_shape", "boolean", NULL, NULL, NULL },
+ { "strict", "boolean", NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL, NULL }
};
@@ -780,7 +781,7 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "session.verify",
"dump_address=0,dump_blocks=0,dump_offsets=,dump_pages=0,"
- "dump_shape=0",
+ "dump_shape=0,strict=0",
confchk_session_verify
},
{ "table.meta",
diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c
index 9d49e36a5ca..85d9bb08d26 100644
--- a/src/third_party/wiredtiger/src/conn/conn_log.c
+++ b/src/third_party/wiredtiger/src/conn/conn_log.c
@@ -270,11 +270,11 @@ err:
}
/*
- * __log_close_server --
+ * __log_file_server --
* The log close server thread.
*/
static WT_THREAD_RET
-__log_close_server(void *arg)
+__log_file_server(void *arg)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -317,6 +317,8 @@ __log_close_server(void *arg)
__wt_spin_lock(session, &log->log_sync_lock);
locked = 1;
WT_ERR(__wt_close(session, &close_fh));
+ WT_ASSERT(session,
+ LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0);
log->sync_lsn = close_end_lsn;
WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
locked = 0;
@@ -324,7 +326,7 @@ __log_close_server(void *arg)
} else
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(session,
- conn->log_close_cond, WT_MILLION));
+ conn->log_file_cond, WT_MILLION));
}
if (0) {
@@ -433,7 +435,7 @@ __log_wrlsn_server(void *arg)
*/
if (F_ISSET(slot, SLOT_CLOSEFH))
WT_ERR(__wt_cond_signal(session,
- conn->log_close_cond));
+ conn->log_file_cond));
WT_ERR(__wt_log_slot_free(session, slot));
}
}
@@ -583,16 +585,16 @@ __wt_logmgr_open(WT_SESSION_IMPL *session)
* If logging is enabled, this thread runs.
*/
WT_RET(__wt_open_internal_session(
- conn, "log-close-server", 0, 0, &conn->log_close_session));
- WT_RET(__wt_cond_alloc(conn->log_close_session,
- "log close server", 0, &conn->log_close_cond));
+ conn, "log-close-server", 0, 0, &conn->log_file_session));
+ WT_RET(__wt_cond_alloc(conn->log_file_session,
+ "log close server", 0, &conn->log_file_cond));
/*
* Start the log file close thread.
*/
- WT_RET(__wt_thread_create(conn->log_close_session,
- &conn->log_close_tid, __log_close_server, conn->log_close_session));
- conn->log_close_tid_set = 1;
+ WT_RET(__wt_thread_create(conn->log_file_session,
+ &conn->log_file_tid, __log_file_server, conn->log_file_session));
+ conn->log_file_tid_set = 1;
/*
* Start the log write LSN thread. It is not configurable.
@@ -667,16 +669,16 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_tid_set = 0;
}
WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
- if (conn->log_close_tid_set) {
- WT_TRET(__wt_cond_signal(session, conn->log_close_cond));
- WT_TRET(__wt_thread_join(session, conn->log_close_tid));
- conn->log_close_tid_set = 0;
+ if (conn->log_file_tid_set) {
+ WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
+ WT_TRET(__wt_thread_join(session, conn->log_file_tid));
+ conn->log_file_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_close_cond));
- if (conn->log_close_session != NULL) {
- wt_session = &conn->log_close_session->iface;
+ WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
+ if (conn->log_file_session != NULL) {
+ wt_session = &conn->log_file_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
- conn->log_close_session = NULL;
+ conn->log_file_session = NULL;
}
if (conn->log_wrlsn_tid_set) {
WT_TRET(__wt_cond_signal(session, conn->log_wrlsn_cond));
diff --git a/src/third_party/wiredtiger/src/conn/conn_open.c b/src/third_party/wiredtiger/src/conn/conn_open.c
index e0e59dea8ba..ca8335fbdb9 100644
--- a/src/third_party/wiredtiger/src/conn/conn_open.c
+++ b/src/third_party/wiredtiger/src/conn/conn_open.c
@@ -92,7 +92,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* transaction ID will catch up with the current ID.
*/
for (;;) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (txn_global->oldest_id == txn_global->current)
break;
__wt_yield();
diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c
index 85442592c39..82568401319 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_stat.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c
@@ -373,6 +373,22 @@ __curstat_file_init(WT_SESSION_IMPL *session,
{
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
+ const char *filename;
+
+ /*
+ * If we are only getting the size of the file, we don't need to open
+ * the tree.
+ */
+ if (F_ISSET(cst, WT_CONN_STAT_SIZE)) {
+ filename = uri;
+ if (!WT_PREFIX_SKIP(filename, "file:"))
+ return (EINVAL);
+ __wt_stat_init_dsrc_stats(&cst->u.dsrc_stats);
+ WT_RET(__wt_block_manager_size(
+ session, filename, &cst->u.dsrc_stats));
+ __wt_curstat_dsrc_final(cst);
+ return (0);
+ }
WT_RET(__wt_session_get_btree_ckpt(session, uri, cfg, 0));
dhandle = session->dhandle;
@@ -508,8 +524,22 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
}
WT_ERR_NOTFOUND_OK(ret);
if ((ret = __wt_config_subgets(
- session, &cval, "clear", &sval)) == 0 && sval.val != 0)
+ session, &cval, "size", &sval)) == 0 && sval.val != 0) {
+ if (F_ISSET(cst, WT_CONN_STAT_FAST | WT_CONN_STAT_ALL))
+ WT_ERR_MSG(session, EINVAL,
+ "only one statistics configuration value "
+ "may be specified");
+ F_SET(cst, WT_CONN_STAT_SIZE);
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+ if ((ret = __wt_config_subgets(
+ session, &cval, "clear", &sval)) == 0 && sval.val != 0) {
+ if (F_ISSET(cst, WT_CONN_STAT_SIZE))
+ WT_ERR_MSG(session, EINVAL,
+ "clear is incompatible with size "
+ "statistics");
F_SET(cst, WT_CONN_STAT_CLEAR);
+ }
WT_ERR_NOTFOUND_OK(ret);
/* If no configuration, use the connection's configuration. */
diff --git a/src/third_party/wiredtiger/src/evict/evict_file.c b/src/third_party/wiredtiger/src/evict/evict_file.c
index 864c116a380..795833d3b25 100644
--- a/src/third_party/wiredtiger/src/evict/evict_file.c
+++ b/src/third_party/wiredtiger/src/evict/evict_file.c
@@ -27,7 +27,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));
/* Make sure the oldest transaction ID is up-to-date. */
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
/* Walk the tree, discarding pages. */
next_ref = NULL;
diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c
index a3bab5457f6..63a905539ce 100644
--- a/src/third_party/wiredtiger/src/evict/evict_lru.c
+++ b/src/third_party/wiredtiger/src/evict/evict_lru.c
@@ -493,6 +493,14 @@ __evict_pass(WT_SESSION_IMPL *session)
session, cache->evict_waiter_cond));
}
+ /*
+ * Increment the shared read generation. We do this
+ * occasionally even if eviction is not currently required, so
+ * that pages have some relative read generation when the
+ * eviction server does need to do some work.
+ */
+ __wt_cache_read_gen_incr(session);
+
WT_RET(__evict_has_work(session, &flags));
if (flags == 0)
break;
@@ -681,7 +689,7 @@ __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
* before evicting, using a special "eviction" isolation level, where
* only globally visible updates can be evicted.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
txn = &session->txn;
saved_iso = txn->isolation;
txn->isolation = TXN_ISO_EVICTION;
@@ -838,6 +846,9 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
WT_ASSERT(session, cache->evict[0].ref != NULL);
+ /* Track the oldest read generation we have in the queue. */
+ cache->read_gen_oldest = cache->evict[0].ref->page->read_gen;
+
if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
/*
* Take all candidates if we only gathered pages with an oldest
@@ -933,16 +944,13 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
incr = dhandle_locked = 0;
retries = 0;
- /* Increment the shared read generation. */
- __wt_cache_read_gen_incr(session);
-
/*
* Update the oldest ID: we use it to decide whether pages are
* candidates for eviction. Without this, if all threads are blocked
* after a long-running transaction (such as a checkpoint) completes,
* we may never start evicting again.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (cache->evict_current == NULL)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
@@ -1222,15 +1230,11 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
continue;
/*
- * If this page has never been considered for eviction,
- * set its read generation to a little bit in the
- * future and move on, give readers a chance to start
- * updating the read generation.
+ * If this page has never been considered for eviction, set its
+ * read generation to somewhere in the middle of the LRU list.
*/
- if (page->read_gen == WT_READGEN_NOTSET) {
- page->read_gen = __wt_cache_read_gen_set(session);
- continue;
- }
+ if (page->read_gen == WT_READGEN_NOTSET)
+ page->read_gen = __wt_cache_read_gen_new(session);
fast: /* If the page can't be evicted, give up. */
if (!__wt_page_can_evict(session, page, 1))
@@ -1424,7 +1428,7 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_server)
*/
page = ref->page;
if (page->read_gen != WT_READGEN_OLDEST)
- page->read_gen = __wt_cache_read_gen_set(session);
+ page->read_gen = __wt_cache_read_gen_bump(session);
/*
* If we are evicting in a dead tree, don't write dirty pages.
@@ -1475,7 +1479,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full)
* to make sure there is free space in the cache.
*/
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
busy = txn_state->id != WT_TXN_NONE ||
session->nhazard > 0 ||
(txn_state->snap_min != WT_TXN_NONE &&
@@ -1524,7 +1528,7 @@ __wt_cache_wait(WT_SESSION_IMPL *session, int full)
* are not busy.
*/
if (busy) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 0);
if (txn_state->id == txn_global->oldest_id ||
txn_state->snap_min == txn_global->oldest_id)
return (0);
diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c
index 92ad8d296df..fe08916b24c 100644
--- a/src/third_party/wiredtiger/src/evict/evict_page.c
+++ b/src/third_party/wiredtiger/src/evict/evict_page.c
@@ -59,6 +59,9 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int exclusive)
conn = S2C(session);
+ /* Checkpoints should never do eviction. */
+ WT_ASSERT(session, !WT_SESSION_IS_CHECKPOINT(session));
+
page = ref->page;
forced_eviction = (page->read_gen == WT_READGEN_OLDEST);
inmem_split = 0;
diff --git a/src/third_party/wiredtiger/src/include/block.h b/src/third_party/wiredtiger/src/include/block.h
index 4ef1b9da4ec..fb8987efdb4 100644
--- a/src/third_party/wiredtiger/src/include/block.h
+++ b/src/third_party/wiredtiger/src/include/block.h
@@ -185,7 +185,8 @@ struct __wt_bm {
int (*sync)(WT_BM *, WT_SESSION_IMPL *, int);
int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
- int (*verify_start)(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *);
+ int (*verify_start)
+ (WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]);
int (*write) (WT_BM *,
WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, int);
int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
@@ -246,6 +247,7 @@ struct __wt_block {
/* Verification support */
int verify; /* If performing verification */
+ int verify_strict; /* Fail hard on any error */
wt_off_t verify_size; /* Checkpoint's file size */
WT_EXTLIST verify_alloc; /* Verification allocation list */
uint64_t frags; /* Maximum frags in the file */
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index 303162fcc93..23b17ef2cd3 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -194,6 +194,11 @@ struct __wt_page_modify {
/* The largest update transaction ID (approximate). */
uint64_t update_txn;
+#ifdef HAVE_DIAGNOSTIC
+ /* Check that transaction time moves forward. */
+ uint64_t last_oldest_id;
+#endif
+
/* Dirty bytes added to the cache. */
size_t bytes_dirty;
@@ -534,10 +539,9 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_REFUSE_DEEPEN 0x10 /* Don't deepen the tree at this page */
-#define WT_PAGE_SCANNING 0x20 /* Obsolete updates are being scanned */
-#define WT_PAGE_SPLIT_INSERT 0x40 /* A leaf page was split for append */
-#define WT_PAGE_SPLITTING 0x80 /* An internal page is growing */
+#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
+#define WT_PAGE_SPLITTING 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
diff --git a/src/third_party/wiredtiger/src/include/cache.h b/src/third_party/wiredtiger/src/include/cache.h
index 11f631416af..58b7b4dbddb 100644
--- a/src/third_party/wiredtiger/src/include/cache.h
+++ b/src/third_party/wiredtiger/src/include/cache.h
@@ -71,6 +71,8 @@ struct __wt_cache {
* Read information.
*/
uint64_t read_gen; /* Page read generation (LRU) */
+ uint64_t read_gen_oldest; /* The oldest read generation that
+ eviction knows about */
/*
* Eviction thread information.
diff --git a/src/third_party/wiredtiger/src/include/cache.i b/src/third_party/wiredtiger/src/include/cache.i
index f952f1bf698..d84069c43fb 100644
--- a/src/third_party/wiredtiger/src/include/cache.i
+++ b/src/third_party/wiredtiger/src/include/cache.i
@@ -27,11 +27,11 @@ __wt_cache_read_gen_incr(WT_SESSION_IMPL *session)
}
/*
- * __wt_cache_read_gen_set --
- * Get the read generation to store in a page.
+ * __wt_cache_read_gen_bump --
+ * Get the read generation to keep a page in memory.
*/
static inline uint64_t
-__wt_cache_read_gen_set(WT_SESSION_IMPL *session)
+__wt_cache_read_gen_bump(WT_SESSION_IMPL *session)
{
/*
* We return read-generations from the future (where "the future" is
@@ -46,6 +46,19 @@ __wt_cache_read_gen_set(WT_SESSION_IMPL *session)
}
/*
+ * __wt_cache_read_gen_new --
+ * Get the read generation for a new page in memory.
+ */
+static inline uint64_t
+__wt_cache_read_gen_new(WT_SESSION_IMPL *session)
+{
+ WT_CACHE *cache;
+
+ cache = S2C(session)->cache;
+ return (__wt_cache_read_gen(session) + cache->read_gen_oldest) / 2;
+}
+
+/*
* __wt_cache_pages_inuse --
* Return the number of pages in use.
*/
diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h
index a95b051fbc0..f24459a4147 100644
--- a/src/third_party/wiredtiger/src/include/connection.h
+++ b/src/third_party/wiredtiger/src/include/connection.h
@@ -252,6 +252,7 @@ struct __wt_connection_impl {
#define WT_CONN_STAT_FAST 0x04 /* "fast" statistics configured */
#define WT_CONN_STAT_NONE 0x08 /* don't gather statistics */
#define WT_CONN_STAT_ON_CLOSE 0x10 /* output statistics on close */
+#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */
uint32_t stat_flags;
WT_CONNECTION_STATS stats; /* Connection statistics */
@@ -317,10 +318,10 @@ struct __wt_connection_impl {
WT_SESSION_IMPL *log_session; /* Log server session */
wt_thread_t log_tid; /* Log server thread */
int log_tid_set; /* Log server thread set */
- WT_CONDVAR *log_close_cond;/* Log close thread wait mutex */
- WT_SESSION_IMPL *log_close_session;/* Log close thread session */
- wt_thread_t log_close_tid; /* Log close thread thread */
- int log_close_tid_set;/* Log close thread set */
+ WT_CONDVAR *log_file_cond; /* Log file thread wait mutex */
+ WT_SESSION_IMPL *log_file_session;/* Log file thread session */
+ wt_thread_t log_file_tid; /* Log file thread thread */
+ int log_file_tid_set;/* Log file thread set */
WT_CONDVAR *log_wrlsn_cond;/* Log write lsn thread wait mutex */
WT_SESSION_IMPL *log_wrlsn_session;/* Log write lsn thread session */
wt_thread_t log_wrlsn_tid; /* Log write lsn thread thread */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 59e795893b5..63b6bb2cbc5 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -50,6 +50,7 @@ extern int __wt_block_open(WT_SESSION_IMPL *session, const char *filename, const
extern int __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_desc_init(WT_SESSION_IMPL *session, WT_FH *fh, uint32_t allocsize);
extern void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats);
+extern int __wt_block_manager_size( WT_SESSION_IMPL *session, const char *filename, WT_DSRC_STATS *stats);
extern int __wt_bm_preload(WT_BM *bm, WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size);
extern int __wt_block_read_off_blind( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset);
@@ -65,7 +66,7 @@ extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
-extern int __wt_block_verify_start( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase);
+extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]);
extern int __wt_block_verify_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_verify_ckpt_load( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_BLOCK_CKPT *ci);
extern int __wt_verify_ckpt_unload(WT_SESSION_IMPL *session, WT_BLOCK *block);
@@ -363,8 +364,12 @@ extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize);
+extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm);
+extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
+extern int __wt_clsm_close(WT_CURSOR *cursor);
extern int __wt_clsm_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[]);
extern int __wt_lsm_manager_config(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg);
extern int __wt_lsm_manager_start(WT_SESSION_IMPL *session);
@@ -435,6 +440,7 @@ extern int __wt_meta_track_checkpoint(WT_SESSION_IMPL *session);
extern int __wt_meta_track_insert(WT_SESSION_IMPL *session, const char *key);
extern int __wt_meta_track_update(WT_SESSION_IMPL *session, const char *key);
extern int __wt_meta_track_fileop( WT_SESSION_IMPL *session, const char *olduri, const char *newuri);
+extern int __wt_meta_track_drop( WT_SESSION_IMPL *session, const char *filename);
extern int __wt_meta_track_handle_lock(WT_SESSION_IMPL *session, int created);
extern int __wt_turtle_init(WT_SESSION_IMPL *session);
extern int __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep);
@@ -659,9 +665,9 @@ extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
extern void __wt_stat_refresh_connection_stats(void *stats_arg);
extern int WT_CDECL __wt_txnid_cmp(const void *v1, const void *v2);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
-extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session);
-extern void __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot);
-extern int __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
+extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force);
+extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]);
extern void __wt_txn_release(WT_SESSION_IMPL *session);
extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]);
diff --git a/src/third_party/wiredtiger/src/include/lsm.h b/src/third_party/wiredtiger/src/include/lsm.h
index aa1d797e3b5..dc6a0d7e027 100644
--- a/src/third_party/wiredtiger/src/include/lsm.h
+++ b/src/third_party/wiredtiger/src/include/lsm.h
@@ -57,15 +57,16 @@ struct __wt_cursor_lsm {
u_int update_count; /* Updates performed. */
-#define WT_CLSM_ACTIVE 0x01 /* Incremented the session count */
-#define WT_CLSM_ITERATE_NEXT 0x02 /* Forward iteration */
-#define WT_CLSM_ITERATE_PREV 0x04 /* Backward iteration */
-#define WT_CLSM_MERGE 0x08 /* Merge cursor, don't update */
-#define WT_CLSM_MINOR_MERGE 0x10 /* Minor merge, include tombstones */
-#define WT_CLSM_MULTIPLE 0x20 /* Multiple cursors have values for the
+#define WT_CLSM_ACTIVE 0x001 /* Incremented the session count */
+#define WT_CLSM_BULK 0x002 /* Open for bulk load */
+#define WT_CLSM_ITERATE_NEXT 0x004 /* Forward iteration */
+#define WT_CLSM_ITERATE_PREV 0x008 /* Backward iteration */
+#define WT_CLSM_MERGE 0x010 /* Merge cursor, don't update */
+#define WT_CLSM_MINOR_MERGE 0x020 /* Minor merge, include tombstones */
+#define WT_CLSM_MULTIPLE 0x040 /* Multiple cursors have values for the
current key */
-#define WT_CLSM_OPEN_READ 0x40 /* Open for reads */
-#define WT_CLSM_OPEN_SNAPSHOT 0x80 /* Open for snapshot isolation */
+#define WT_CLSM_OPEN_READ 0x080 /* Open for reads */
+#define WT_CLSM_OPEN_SNAPSHOT 0x100 /* Open for snapshot isolation */
uint32_t flags;
};
diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h
index 927ab09d5f9..d2b369a41c4 100644
--- a/src/third_party/wiredtiger/src/include/txn.h
+++ b/src/third_party/wiredtiger/src/include/txn.h
@@ -25,6 +25,9 @@
#define WT_SESSION_TXN_STATE(s) (&S2C(s)->txn_global.states[(s)->id])
+#define WT_SESSION_IS_CHECKPOINT(s) \
+ ((s)->id != 0 && (s)->id == S2C(s)->txn_global.checkpoint_id)
+
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state {
volatile uint64_t id;
volatile uint64_t snap_min;
@@ -42,22 +45,19 @@ struct __wt_txn_global {
*/
volatile uint64_t oldest_id;
- /* The oldest session found in the last scan. */
- uint32_t oldest_session;
-
/* Count of scanning threads, or -1 for exclusive access. */
volatile int32_t scan_count;
/*
- * Track information about the running checkpoint. The transaction IDs
- * used when checkpointing are special. Checkpoints can run for a long
- * time so we keep them out of regular visibility checks. Eviction and
- * checkpoint operations know when they need to be aware of
- * checkpoint IDs.
+ * Track information about the running checkpoint. The transaction
+ * snapshot used when checkpointing is special. Checkpoints can run
+ * for a long time so we keep them out of regular visibility checks.
+ * Eviction and checkpoint operations know when they need to be aware
+ * of checkpoint transactions.
*/
+ volatile uint32_t checkpoint_id; /* Checkpoint's session ID */
volatile uint64_t checkpoint_gen;
- volatile uint64_t checkpoint_id;
- volatile uint64_t checkpoint_snap_min;
+ volatile uint64_t checkpoint_pinned;
WT_TXN_STATE *states; /* Per-session transaction states */
};
diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i
index 4ae80231c65..a9b19ca1ff5 100644
--- a/src/third_party/wiredtiger/src/include/txn.i
+++ b/src/third_party/wiredtiger/src/include/txn.i
@@ -98,33 +98,37 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_TXN_GLOBAL *txn_global;
- uint64_t checkpoint_snap_min, oldest_id;
+ uint64_t checkpoint_pinned, oldest_id;
+ uint32_t checkpoint_gen;
txn_global = &S2C(session)->txn_global;
btree = S2BT_SAFE(session);
/*
- * Take a local copy of ID in case they are updated while we are
+ * Take a local copy of these IDs in case they are updated while we are
* checking visibility.
*/
- checkpoint_snap_min = txn_global->checkpoint_snap_min;
- oldest_id = txn_global->oldest_id;
+ WT_ORDERED_READ(oldest_id, txn_global->oldest_id);
+ WT_ORDERED_READ(checkpoint_gen, txn_global->checkpoint_gen);
+ WT_ORDERED_READ(checkpoint_pinned, txn_global->checkpoint_pinned);
/*
- * If there is no active checkpoint or this handle is up to date with
- * the active checkpoint it's safe to ignore the checkpoint ID in the
- * visibility check.
+ * Checkpoint transactions often fall behind ordinary application
+ * threads. Take special effort to not keep changes pinned in cache
+ * if they are only required for the checkpoint and it has already
+ * seen them.
+ *
+ * If there is no active checkpoint, this session is doing the
+ * checkpoint, or this handle is up to date with the active checkpoint
+ * then it's safe to ignore the checkpoint ID in the visibility check.
*/
- if (checkpoint_snap_min != WT_TXN_NONE && (btree == NULL ||
- btree->checkpoint_gen != txn_global->checkpoint_gen) &&
- TXNID_LT(checkpoint_snap_min, oldest_id))
- /*
- * Use the checkpoint ID for the visibility check if it is the
- * oldest ID in the system.
- */
- oldest_id = checkpoint_snap_min;
+ if (checkpoint_pinned == WT_TXN_NONE ||
+ TXNID_LT(oldest_id, checkpoint_pinned) ||
+ WT_SESSION_IS_CHECKPOINT(session) ||
+ (btree != NULL && btree->checkpoint_gen == checkpoint_gen))
+ return (oldest_id);
- return (oldest_id);
+ return (checkpoint_pinned);
}
/*
@@ -154,20 +158,20 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
txn = &session->txn;
- /*
- * Eviction only sees globally visible updates, or if there is a
- * checkpoint transaction running, use its transaction.
- */
- if (txn->isolation == TXN_ISO_EVICTION)
- return (__wt_txn_visible_all(session, id));
+ /* Changes with no associated transaction are always visible. */
+ if (id == WT_TXN_NONE)
+ return (1);
/* Nobody sees the results of aborted transactions. */
if (id == WT_TXN_ABORTED)
return (0);
- /* Changes with no associated transaction are always visible. */
- if (id == WT_TXN_NONE)
- return (1);
+ /*
+ * Eviction only sees globally visible updates, or if there is a
+ * checkpoint transaction running, use its transaction.
+ */
+ if (txn->isolation == TXN_ISO_EVICTION)
+ return (__wt_txn_visible_all(session, id));
/*
* Read-uncommitted transactions see all other changes.
@@ -206,6 +210,37 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
}
/*
+ * __wt_txn_begin --
+ * Begin a transaction.
+ *
+ * The transaction starts from the session's isolation level and the
+ * txn_logsync value read through S2C(session); when cfg is non-NULL it
+ * is then passed to __wt_txn_config. The transaction is flagged
+ * TXN_RUNNING and, for snapshot isolation only, __wt_session_copy_values
+ * is called if the session has open cursors, the cache-full check runs
+ * and a snapshot is taken before returning 0.
+ *
+ * NOTE(review): TXN_RUNNING is set before __wt_session_copy_values and
+ * __wt_cache_full_check can fail, so an error from either returns with
+ * the flag still set - confirm callers roll the transaction back.
+ */
+static int
+__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+{
+ WT_TXN *txn;
+
+ txn = &session->txn;
+ txn->isolation = session->isolation;
+ txn->txn_logsync = S2C(session)->txn_logsync;
+
+ if (cfg != NULL)
+ WT_RET(__wt_txn_config(session, cfg));
+
+ F_SET(txn, TXN_RUNNING);
+ if (txn->isolation == TXN_ISO_SNAPSHOT) {
+ if (session->ncursors > 0)
+ WT_RET(__wt_session_copy_values(session));
+
+ /*
+ * We're about to allocate a snapshot: if we need to block for
+ * eviction, it's better to do it beforehand.
+ */
+ WT_RET(__wt_cache_full_check(session));
+ __wt_txn_get_snapshot(session);
+ }
+ return (0);
+}
+
+/*
* __wt_txn_read --
* Get the first visible update in a list (or NULL if none are visible).
*/
@@ -301,7 +336,7 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
if (!F_ISSET(txn, TXN_HAS_ID)) {
conn = S2C(session);
txn_global = &conn->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
@@ -393,7 +428,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
/*
* If there is no transaction running (so we don't have an ID), and no
@@ -418,7 +453,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
if (txn->isolation != TXN_ISO_READ_UNCOMMITTED &&
!F_ISSET(txn, TXN_HAS_SNAPSHOT))
- __wt_txn_refresh(session, 1);
+ __wt_txn_get_snapshot(session);
}
/*
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index e50beac3bfe..4804290acba 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -945,14 +945,16 @@ struct __wt_session {
* configured when the database is configured with "all"\, but the
* cursor open will fail if "all" is specified when the database is
* configured with "fast"\, and the cursor open will fail in all cases
- * when the database is configured with "none". If \c statistics is not
+ * when the database is configured with "none". If "size" is
+ * configured\, only the underlying size of the object on disk is filled
+ * in and the object is not opened. If \c statistics is not
* configured\, the default configuration is the database configuration.
* The "clear" configuration resets statistics after gathering them\,
* where appropriate (for example\, a cache size statistic is not
* cleared\, while the count of cursor insert operations will be
* cleared). See @ref statistics for more information., a list\, with
* values chosen from the following options: \c "all"\, \c "fast"\, \c
- * "clear"; default empty.}
+ * "clear"\, \c "size"; default empty.}
* @config{target, if non-empty\, backup the list of objects; valid only
* for a backup data source., a list of strings; default empty.}
* @configend
@@ -1335,6 +1337,10 @@ struct __wt_session {
* @config{dump_shape, Display the shape of the tree after
* verification\, using the application's message handler\, intended for
* debugging., a boolean flag; default \c false.}
+ * @config{strict, Treat any verification problem as an error; by
+ * default\, verify will warn\, but not fail\, in the case of errors
+ * that won't affect future behavior (for example\, a leaked block)., a
+ * boolean flag; default \c false.}
* @configend
* @ebusy_errors
*/
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c
index b63038b976e..5c1d76105cb 100644
--- a/src/third_party/wiredtiger/src/log/log.c
+++ b/src/third_party/wiredtiger/src/log/log.c
@@ -48,6 +48,20 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
conn = S2C(session);
log = conn->log;
+
+ /*
+ * We need to wait for the previous log file to get written
+ * to disk before we sync out the current one and advance
+ * the LSN. Signal the worker thread because we know the
+ * LSN has moved into a later log file and there should be a
+ * log file ready to close.
+ */
+ while (log->sync_lsn.file < min_lsn->file) {
+ WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
+ WT_ERR(__wt_cond_wait(
+ session, log->log_sync_cond, 10000));
+ }
+
__wt_spin_lock(session, &log->log_sync_lock);
WT_ASSERT(session, log->log_dir_fh != NULL);
/*
@@ -1063,7 +1077,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* Signal the close thread if needed.
*/
if (F_ISSET(slot, SLOT_CLOSEFH))
- WT_ERR(__wt_cond_signal(session, conn->log_close_cond));
+ WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
/*
* Try to consolidate calls to fsync to wait less. Acquire a spin lock
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
index 7665e417722..111de7a2be1 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c
@@ -20,11 +20,11 @@ static int __clsm_open_cursors(WT_CURSOR_LSM *, int, u_int, uint32_t);
static int __clsm_reset_cursors(WT_CURSOR_LSM *, WT_CURSOR *);
/*
- * __clsm_request_switch --
+ * __wt_clsm_request_switch --
* Request an LSM tree switch for a cursor operation.
*/
-static inline int
-__clsm_request_switch(WT_CURSOR_LSM *clsm)
+int
+__wt_clsm_request_switch(WT_CURSOR_LSM *clsm)
{
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
@@ -44,9 +44,9 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm)
if (lsm_tree->nchunks == 0 ||
(clsm->dsk_gen == lsm_tree->dsk_gen &&
!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))) {
+ F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
ret = __wt_lsm_manager_push_entry(
session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
- F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
}
WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
}
@@ -55,6 +55,41 @@ __clsm_request_switch(WT_CURSOR_LSM *clsm)
}
/*
+ * __wt_clsm_await_switch --
+ * Wait for a switch to have completed in the LSM tree.
+ *
+ * Polls for as long as the tree has no chunks or its dsk_gen still
+ * equals the cursor's dsk_gen, sleeping between checks. A
+ * WT_LSM_WORK_SWITCH entry is pushed to the LSM manager on the first
+ * pass and again on every 1000th pass. Returns 0 once the condition
+ * clears, or the error from __wt_lsm_manager_push_entry.
+ */
+int
+__wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
+{
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+ int waited;
+
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ /*
+ * If there is no primary chunk, or a chunk has overflowed the hard
+ * limit, which either means a worker thread has fallen behind or there
+ * has just been a user-level checkpoint, wait until the tree changes.
+ *
+ * We used to switch chunks in the application thread here, but that is
+ * problematic because there is a transaction in progress and it could
+ * roll back, leaving the metadata inconsistent.
+ */
+ for (waited = 0;
+ lsm_tree->nchunks == 0 ||
+ clsm->dsk_gen == lsm_tree->dsk_gen;
+ ++waited) {
+ if (waited % 1000 == 0)
+ WT_RET(__wt_lsm_manager_push_entry(
+ session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
+ __wt_sleep(0, 10);
+ }
+ return (0);
+}
+
+/*
* __clsm_enter_update --
* Make sure an LSM cursor is ready to perform an update.
*/
@@ -65,7 +100,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
WT_LSM_CHUNK *primary_chunk;
WT_LSM_TREE *lsm_tree;
WT_SESSION_IMPL *session;
- int hard_limit, have_primary, ovfl, waited;
+ int hard_limit, have_primary, ovfl;
lsm_tree = clsm->lsm_tree;
ovfl = 0;
@@ -108,30 +143,13 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
}
/* Request a switch. */
- WT_RET(__clsm_request_switch(clsm));
+ WT_RET(__wt_clsm_request_switch(clsm));
/* If we only overflowed the soft limit, we're done. */
if (have_primary && !hard_limit)
return (0);
- /*
- * If there is no primary chunk, or it has overflowed the hard limit,
- * which either means a worker thread has fallen behind or there has
- * just been a user-level checkpoint, wait until the tree changes.
- *
- * We used to switch chunks in the application thread if we got to
- * here, but that is problematic because there is a transaction in
- * progress and it could roll back, leaving the metadata inconsistent.
- */
- for (waited = 0;
- lsm_tree->nchunks == 0 ||
- clsm->dsk_gen == lsm_tree->dsk_gen;
- ++waited) {
- if (waited % 1000 == 0)
- WT_RET(__wt_lsm_manager_push_entry(
- session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
- __wt_sleep(0, 10);
- }
+ WT_RET(__wt_clsm_await_switch(clsm));
return (0);
}
@@ -1423,11 +1441,11 @@ err: __clsm_leave(clsm);
}
/*
- * __clsm_close --
+ * __wt_clsm_close --
* WT_CURSOR->close method for the LSM cursor type.
*/
-static int
-__clsm_close(WT_CURSOR *cursor)
+int
+__wt_clsm_close(WT_CURSOR *cursor)
{
WT_CURSOR_LSM *clsm;
WT_DECL_RET;
@@ -1481,14 +1499,17 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
__clsm_update, /* update */
__clsm_remove, /* remove */
__wt_cursor_reconfigure, /* reconfigure */
- __clsm_close); /* close */
+ __wt_clsm_close); /* close */
WT_CURSOR *cursor;
WT_CURSOR_LSM *clsm;
WT_DECL_RET;
WT_LSM_TREE *lsm_tree;
+ int bulk;
+ bulk = 0;
clsm = NULL;
cursor = NULL;
+ lsm_tree = NULL;
if (!WT_PREFIX_MATCH(uri, "lsm:"))
return (EINVAL);
@@ -1498,9 +1519,22 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
WT_RET_MSG(session, EINVAL,
"LSM does not support opening by checkpoint");
+ WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
+ if (cval.val != 0)
+ bulk = 1;
+
/* Get the LSM tree. */
WT_WITH_DHANDLE_LOCK(session,
- ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree));
+ ret = __wt_lsm_tree_get(session, uri, bulk, &lsm_tree));
+ /*
+ * Check whether the exclusive open for a bulk load succeeded, and
+ * if it did ensure that it's safe to bulk load into the tree.
+ */
+ if (bulk && (ret == EBUSY || (ret == 0 && lsm_tree->nchunks > 1)))
+ WT_ERR_MSG(session, EINVAL,
+ "bulk-load is only supported on newly created LSM trees");
+ WT_ASSERT(session, !bulk || lsm_tree->exclusive);
+ /* Flag any errors from the tree get. */
WT_RET(ret);
WT_ERR(__wt_calloc_one(session, &clsm));
@@ -1523,9 +1557,20 @@ __wt_clsm_open(WT_SESSION_IMPL *session,
WT_STATIC_ASSERT(offsetof(WT_CURSOR_LSM, iface) == 0);
WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
+ if (bulk)
+ WT_ERR(__wt_clsm_open_bulk(clsm, cfg));
+
if (0) {
err: if (clsm != NULL)
- WT_TRET(__clsm_close(cursor));
+ WT_TRET(__wt_clsm_close(cursor));
+ else if (lsm_tree != NULL)
+ __wt_lsm_tree_release(session, lsm_tree);
+
+ /*
+ * We open bulk cursors after setting the returned cursor.
+ * Fix that here.
+ */
+ *cursorp = NULL;
}
return (ret);
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c
new file mode 100644
index 00000000000..8099c87c3bf
--- /dev/null
+++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor_bulk.c
@@ -0,0 +1,131 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __clsm_close_bulk --
+ *	WT_CURSOR->close method for LSM bulk cursors.
+ *
+ *	Closes the underlying bulk cursor, marks the first chunk on disk and
+ *	writes the LSM metadata, then closes the LSM cursor itself. The LSM
+ *	cursor is always closed, so a failure in an earlier step cannot leak
+ *	it; an error from an earlier step is still reported to the caller.
+ */
+static int
+__clsm_close_bulk(WT_CURSOR *cursor)
+{
+	WT_CURSOR_LSM *clsm;
+	WT_CURSOR *bulk_cursor;
+	WT_DECL_RET;
+	WT_LSM_TREE *lsm_tree;
+	WT_SESSION_IMPL *session;
+
+	clsm = (WT_CURSOR_LSM *)cursor;
+	lsm_tree = clsm->lsm_tree;
+	session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+	/*
+	 * Close the bulk cursor to ensure the chunk is written to disk. The
+	 * handle must not be used again whether or not the close succeeds,
+	 * so forget it before looking at the result.
+	 */
+	bulk_cursor = clsm->cursors[0];
+	ret = bulk_cursor->close(bulk_cursor);
+	clsm->cursors[0] = NULL;
+	clsm->nchunks = 0;
+
+	/* Set ondisk, and flush the metadata, only if the chunk was written. */
+	if (ret == 0) {
+		F_SET(lsm_tree->chunk[0], WT_LSM_CHUNK_ONDISK);
+		if ((ret = __wt_lsm_meta_write(session, lsm_tree)) == 0)
+			++lsm_tree->dsk_gen;
+	}
+
+	/* Close the LSM cursor, even after an earlier failure. */
+	WT_TRET(__wt_clsm_close(cursor));
+
+	return (ret);
+}
+
+/*
+ * __clsm_insert_bulk --
+ * WT_CURSOR->insert method for LSM bulk cursors.
+ *
+ * Passes the LSM cursor's key and value items to the bulk cursor held
+ * in the first slot of clsm->cursors and inserts through it. Returns 0
+ * on success or the error from the bulk cursor's insert.
+ *
+ * NOTE(review): the first chunk's count is incremented before the
+ * insert is attempted, so a failed insert is still counted - confirm
+ * that is intended.
+ */
+static int
+__clsm_insert_bulk(WT_CURSOR *cursor)
+{
+ WT_CURSOR *bulk_cursor;
+ WT_CURSOR_LSM *clsm;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+
+ clsm = (WT_CURSOR_LSM *)cursor;
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ WT_ASSERT(session, lsm_tree->nchunks == 1 && clsm->nchunks == 1);
+ ++lsm_tree->chunk[0]->count;
+ bulk_cursor = *clsm->cursors;
+ bulk_cursor->set_key(bulk_cursor, &cursor->key);
+ bulk_cursor->set_value(bulk_cursor, &cursor->value);
+ WT_RET(bulk_cursor->insert(bulk_cursor));
+
+ return (0);
+}
+
+/*
+ * __wt_clsm_open_bulk --
+ * WT_SESSION->open_cursor method for LSM bulk cursors.
+ *
+ * Flags the LSM cursor as bulk, limits it to insert and close, requests
+ * and waits for an LSM tree switch, then opens a cursor on the first
+ * chunk's URI with the caller's cfg and stores it in the first slot of
+ * clsm->cursors. Returns 0 on success, or the error from the step that
+ * failed.
+ */
+int
+__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
+{
+ WT_CURSOR *cursor, *bulk_cursor;
+ WT_LSM_TREE *lsm_tree;
+ WT_SESSION_IMPL *session;
+
+ bulk_cursor = NULL;
+ cursor = &clsm->iface;
+ lsm_tree = clsm->lsm_tree;
+ session = (WT_SESSION_IMPL *)clsm->iface.session;
+
+ F_SET(clsm, WT_CLSM_BULK);
+
+ /* Bulk cursors are limited to insert and close. */
+ __wt_cursor_set_notsup(cursor);
+ cursor->insert = __clsm_insert_bulk;
+ cursor->close = __clsm_close_bulk;
+
+ /* Setup the first chunk in the tree. */
+ WT_RET(__wt_clsm_request_switch(clsm));
+ WT_RET(__wt_clsm_await_switch(clsm));
+
+ /*
+ * Grab and release the LSM tree lock to ensure that the first chunk
+ * has been fully created before proceeding. We have the LSM tree
+ * open exclusive, so that saves us from needing the lock generally.
+ */
+ WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
+ WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
+
+ /*
+ * The bulk cursor opened on the first chunk is not a regular LSM
+ * chunk cursor, but it uses the standard storage locations. Allocate
+ * the space for a bloom filter as well - it makes cleanup simpler.
+ * Cleaned up by cursor close on error.
+ */
+ WT_RET(__wt_calloc_one(session, &clsm->blooms));
+ clsm->bloom_alloc = 1;
+ WT_RET(__wt_calloc_one(session, &clsm->cursors));
+ clsm->cursor_alloc = 1;
+ clsm->nchunks = 1;
+
+ /*
+ * Open a bulk cursor on the first chunk in the tree. No LSM tree
+ * lock is held at this point: the read lock was taken and released
+ * above to ensure that the first chunk has been fully created.
+ * Pass through the application config to ensure the tree is open
+ * for bulk access.
+ */
+ WT_RET(__wt_open_cursor(session,
+ lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
+ clsm->cursors[0] = bulk_cursor;
+ /* LSM cursors are always raw */
+ F_SET(bulk_cursor, WT_CURSTD_RAW);
+
+ return (0);
+}
+
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_stat.c b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
index e994300d4d3..656e43c978d 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_stat.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_stat.c
@@ -40,11 +40,12 @@ __curstat_lsm_init(
/* Propagate all, fast and/or clear to the cursors we open. */
if (!F_ISSET(cst, WT_CONN_STAT_NONE)) {
(void)snprintf(config, sizeof(config),
- "statistics=(%s%s%s)",
- F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
+ "statistics=(%s%s%s%s)",
F_ISSET(cst, WT_CONN_STAT_ALL) ? "all," : "",
+ F_ISSET(cst, WT_CONN_STAT_CLEAR) ? "clear," : "",
!F_ISSET(cst, WT_CONN_STAT_ALL) &&
- F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "");
+ F_ISSET(cst, WT_CONN_STAT_FAST) ? "fast," : "",
+ F_ISSET(cst, WT_CONN_STAT_SIZE) ? "size," : "");
cfg[1] = disk_cfg[1] = config;
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
index 2bded10cb96..63f19858279 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c
@@ -10,7 +10,8 @@
static int __lsm_tree_cleanup_old(WT_SESSION_IMPL *, const char *);
static int __lsm_tree_open_check(WT_SESSION_IMPL *, WT_LSM_TREE *);
-static int __lsm_tree_open(WT_SESSION_IMPL *, const char *, WT_LSM_TREE **);
+static int __lsm_tree_open(
+ WT_SESSION_IMPL *, const char *, int, WT_LSM_TREE **);
static int __lsm_tree_set_name(WT_SESSION_IMPL *, WT_LSM_TREE *, const char *);
/*
@@ -430,7 +431,7 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
*/
if (ret == 0)
WT_WITH_DHANDLE_LOCK(session,
- ret = __lsm_tree_open(session, uri, &lsm_tree));
+ ret = __lsm_tree_open(session, uri, 1, &lsm_tree));
if (ret == 0)
__wt_lsm_tree_release(session, lsm_tree);
@@ -539,8 +540,8 @@ __lsm_tree_open_check(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Open an LSM tree structure.
*/
static int
-__lsm_tree_open(
- WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
+__lsm_tree_open(WT_SESSION_IMPL *session,
+ const char *uri, int exclusive, WT_LSM_TREE **treep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
@@ -556,7 +557,8 @@ __lsm_tree_open(
WT_RET(__wt_lsm_manager_start(session));
/* Make sure no one beat us to it. */
- if ((ret = __lsm_tree_find(session, uri, 0, treep)) != WT_NOTFOUND)
+ if ((ret = __lsm_tree_find(
+ session, uri, exclusive, treep)) != WT_NOTFOUND)
return (ret);
/* Try to open the tree. */
@@ -582,6 +584,7 @@ __lsm_tree_open(
* with getting handles exclusive.
*/
lsm_tree->refcnt = 1;
+ lsm_tree->exclusive = exclusive;
lsm_tree->queue_ref = 0;
/* Set a flush timestamp as a baseline. */
@@ -613,8 +616,9 @@ __wt_lsm_tree_get(WT_SESSION_IMPL *session,
ret = __lsm_tree_find(session, uri, exclusive, treep);
if (ret == WT_NOTFOUND)
- ret = __lsm_tree_open(session, uri, treep);
+ ret = __lsm_tree_open(session, uri, exclusive, treep);
+ WT_ASSERT(session, ret != 0 || exclusive == (*treep)->exclusive);
return (ret);
}
diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
index 0566e0abc70..99140f89c51 100644
--- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
+++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c
@@ -281,7 +281,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
}
/* Stop if a running transaction needs the chunk. */
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
if (chunk->switch_txn == WT_TXN_NONE ||
!__wt_txn_visible_all(session, chunk->switch_txn)) {
WT_RET(__wt_verbose(session, WT_VERB_LSM,
diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c
index 62d4df47ff6..66e7e3977f4 100644
--- a/src/third_party/wiredtiger/src/meta/meta_track.c
+++ b/src/third_party/wiredtiger/src/meta/meta_track.c
@@ -17,6 +17,7 @@ typedef struct __wt_meta_track {
enum {
WT_ST_EMPTY, /* Unused slot */
WT_ST_CHECKPOINT, /* Complete a checkpoint */
+ WT_ST_DROP_COMMIT, /* Drop post commit */
WT_ST_FILEOP, /* File operation */
WT_ST_LOCK, /* Lock a handle */
WT_ST_REMOVE, /* Remove a metadata entry */
@@ -106,7 +107,8 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
* Unlock handles and complete checkpoints regardless of whether we are
* unrolling.
*/
- if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK)
+ if (!unroll && trk->op != WT_ST_CHECKPOINT &&
+ trk->op != WT_ST_DROP_COMMIT && trk->op != WT_ST_LOCK)
goto free;
switch (trk->op) {
@@ -120,6 +122,14 @@ __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll)
WT_TRET(bm->checkpoint_resolve(bm, session)));
}
break;
+ case WT_ST_DROP_COMMIT:
+ if ((tret = __wt_remove_if_exists(session, trk->a)) != 0) {
+ __wt_err(session, tret,
+ "metadata remove dropped file %s",
+ trk->a);
+ WT_TRET(tret);
+ }
+ break;
case WT_ST_LOCK: /* Handle lock, see above */
if (unroll && trk->created)
F_SET(trk->dhandle, WT_DHANDLE_DISCARD);
@@ -394,6 +404,23 @@ __wt_meta_track_fileop(
}
/*
+ * __wt_meta_track_drop --
+ * Track a file drop, where the remove is deferred until commit.
+ *
+ * Takes the next tracking slot, marks it WT_ST_DROP_COMMIT and stores
+ * a copy of filename in the slot. Returns 0 on success, or the error
+ * from __meta_track_next or __wt_strdup.
+ *
+ * NOTE(review): the op is set before the name is copied, so a failed
+ * __wt_strdup leaves a WT_ST_DROP_COMMIT entry behind, possibly without
+ * a name - confirm __meta_track_apply tolerates that.
+ * NOTE(review): the WT_ST_DROP_COMMIT case in __meta_track_apply does
+ * not test unroll - confirm the file is not removed on rollback.
+ */
+int
+__wt_meta_track_drop(
+ WT_SESSION_IMPL *session, const char *filename)
+{
+ WT_META_TRACK *trk;
+
+ WT_RET(__meta_track_next(session, &trk));
+
+ trk->op = WT_ST_DROP_COMMIT;
+ WT_RET(__wt_strdup(session, filename, &trk->a));
+ return (0);
+}
+
+/*
* __wt_meta_track_handle_lock --
* Track a locked handle.
*/
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 573ea8811f8..14ab05fbb25 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -363,6 +363,19 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
}
+#ifdef HAVE_DIAGNOSTIC
+ {
+ /*
+ * Check that transaction time always moves forward for a given page.
+ * If this check fails, reconciliation can free something that a future
+ * reconciliation will need.
+ */
+ uint64_t oldest_id = __wt_txn_oldest_id(session);
+ WT_ASSERT(session, TXNID_LE(mod->last_oldest_id, oldest_id));
+ mod->last_oldest_id = oldest_id;
+ }
+#endif
+
/* Record the most recent transaction ID we will *not* write. */
mod->disk_snap_min = session->txn.snap_min;
@@ -839,6 +852,7 @@ static inline int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
+ WT_DECL_RET;
WT_ITEM ovfl;
WT_PAGE *page;
WT_UPDATE *upd, *upd_list, *upd_ovfl;
@@ -977,8 +991,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
!__wt_txn_visible_all(session, min_txn)) {
- WT_RET(__wt_ovfl_txnc_search(
- page, vpack->data, vpack->size, &ovfl));
+ if ((ret = __wt_ovfl_txnc_search(
+ page, vpack->data, vpack->size, &ovfl)) != 0)
+ WT_PANIC_RET(session, ret,
+ "cached overflow item discarded early");
+
/*
* Create an update structure with an impossibly low transaction
* ID and append it to the update list we're about to save.
@@ -1221,10 +1238,6 @@ __rec_child_deleted(
if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
WT_PANIC_RET(session, EINVAL,
"reconciliation illegally skipped an update");
-
- /* If this page cannot be evicted, quit now. */
- if (F_ISSET(r, WT_EVICTING))
- return (EBUSY);
}
/*
@@ -1265,6 +1278,18 @@ __rec_child_deleted(
}
/*
+ * If there are deleted child pages that we can't discard immediately,
+ * keep the page dirty so they are eventually freed.
+ */
+ if (ref->addr != NULL) {
+ r->leave_dirty = 1;
+
+ /* This page cannot be evicted, quit now. */
+ if (F_ISSET(r, WT_EVICTING))
+ return (EBUSY);
+ }
+
+ /*
* Minor memory cleanup: if a truncate call deleted this page and we
* were ever forced to instantiate the page in memory, we would have
* built a list of updates in the page reference in order to be able
diff --git a/src/third_party/wiredtiger/src/schema/schema_drop.c b/src/third_party/wiredtiger/src/schema/schema_drop.c
index 03097128ec2..694d07c65bf 100644
--- a/src/third_party/wiredtiger/src/schema/schema_drop.c
+++ b/src/third_party/wiredtiger/src/schema/schema_drop.c
@@ -39,10 +39,10 @@ __drop_file(
return (ret);
/*
- * Remove the underlying physical file. There is no point tracking this
- * operation: there is no going back from here.
+ * Schedule the remove of the underlying physical file when the drop
+ * completes.
*/
- WT_TRET(__wt_remove_if_exists(session, filename));
+ WT_TRET(__wt_meta_track_drop(session, filename));
return (ret);
}
@@ -120,8 +120,13 @@ __drop_table(
for (i = 0; i < WT_COLGROUPS(table); i++) {
if ((colgroup = table->cgroups[i]) == NULL)
continue;
- WT_ERR(__wt_metadata_remove(session, colgroup->name));
+ /*
+ * Drop the column group before updating the metadata to avoid
+ * the metadata for the table becoming inconsistent if we can't
+ * get exclusive access.
+ */
WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
+ WT_ERR(__wt_metadata_remove(session, colgroup->name));
}
/* Drop the indices. */
@@ -129,8 +134,13 @@ __drop_table(
for (i = 0; i < table->nindices; i++) {
if ((idx = table->indices[i]) == NULL)
continue;
- WT_ERR(__wt_metadata_remove(session, idx->name));
+ /*
+ * Drop the index before updating the metadata to avoid
+ * the metadata for the table becoming inconsistent if we can't
+ * get exclusive access.
+ */
WT_ERR(__wt_schema_drop(session, idx->source, cfg));
+ WT_ERR(__wt_metadata_remove(session, idx->name));
}
WT_ERR(__wt_schema_remove_table(session, table));
diff --git a/src/third_party/wiredtiger/src/schema/schema_rename.c b/src/third_party/wiredtiger/src/schema/schema_rename.c
index 51281eccec5..c00ffa7d61c 100644
--- a/src/third_party/wiredtiger/src/schema/schema_rename.c
+++ b/src/third_party/wiredtiger/src/schema/schema_rename.c
@@ -155,15 +155,18 @@ __rename_tree(WT_SESSION_IMPL *session,
cval.str + cval.len));
/*
+ * Do the rename before updating the metadata to avoid leaving the
+ * metadata inconsistent if the rename fails.
+ */
+ WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));
+
+ /*
* Remove the old metadata entry.
* Insert the new metadata entry.
*/
WT_ERR(__wt_metadata_remove(session, name));
WT_ERR(__wt_metadata_insert(session, nn->data, nv->data));
- /* Rename the file. */
- WT_ERR(__wt_schema_rename(session, os->data, ns->data, cfg));
-
err: __wt_scr_free(session, &nn);
__wt_scr_free(session, &ns);
__wt_scr_free(session, &nv);
diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c
index c838785a9c3..f6f5a695b4f 100644
--- a/src/third_party/wiredtiger/src/txn/txn.c
+++ b/src/third_party/wiredtiger/src/txn/txn.c
@@ -57,66 +57,44 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
txn = &session->txn;
txn_state = &S2C(session)->txn_global.states[session->id];
- if (txn_state->snap_min != WT_TXN_NONE) {
- WT_ASSERT(session,
- session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
- !__wt_txn_visible_all(session, txn_state->snap_min));
- txn_state->snap_min = WT_TXN_NONE;
- }
- F_CLR(txn, TXN_HAS_SNAPSHOT);
-}
+ WT_ASSERT(session,
+ txn_state->snap_min == WT_TXN_NONE ||
+ session->txn.isolation == TXN_ISO_READ_UNCOMMITTED ||
+ !__wt_txn_visible_all(session, txn_state->snap_min));
-/*
- * __wt_txn_update_oldest --
- * Sweep the running transactions to update the oldest ID required.
- */
-void
-__wt_txn_update_oldest(WT_SESSION_IMPL *session)
-{
- /*
- * !!!
- * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
- * method (for the oldest transaction ID not yet visible to a running
- * transaction), and then comparing that oldest ID against committed
- * transactions to see if updates for a committed transaction are still
- * visible to running transactions, the oldest transaction ID may be
- * the same as the last committed transaction ID, if the transaction
- * state wasn't refreshed after the last transaction committed. Push
- * past the last committed transaction.
- */
- __wt_txn_refresh(session, 0);
+ txn_state->snap_min = WT_TXN_NONE;
+ F_CLR(txn, TXN_HAS_SNAPSHOT);
}
/*
- * __wt_txn_refresh --
- * Allocate a transaction ID and/or a snapshot.
+ * __wt_txn_get_snapshot --
+ * Allocate a snapshot.
*/
void
-__wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
+__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s, *txn_state;
- uint64_t current_id, id, oldest_id;
+ uint64_t current_id, id;
uint64_t prev_oldest_id, snap_min;
- uint32_t i, n, oldest_session, session_cnt;
+ uint32_t i, n, session_cnt;
int32_t count;
conn = S2C(session);
txn = &session->txn;
txn_global = &conn->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
/* For pure read-only workloads, avoid scanning. */
if (prev_oldest_id == current_id) {
- if (get_snapshot) {
- txn_state->snap_min = current_id;
- __txn_sort_snapshot(session, 0, current_id);
- }
+ txn_state->snap_min = current_id;
+ __txn_sort_snapshot(session, 0, current_id);
+
/* Check that the oldest ID has not moved in the meantime. */
if (prev_oldest_id == txn_global->oldest_id &&
txn_global->scan_count == 0)
@@ -136,17 +114,11 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
/* The oldest ID cannot change until the scan count goes to zero. */
prev_oldest_id = txn_global->oldest_id;
- current_id = oldest_id = snap_min = txn_global->current;
- oldest_session = 0;
+ current_id = snap_min = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = n = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /* Skip the checkpoint transaction; it is never read from. */
- if (txn_global->checkpoint_id != WT_TXN_NONE &&
- s->id == txn_global->checkpoint_id)
- continue;
-
/*
* Build our snapshot of any concurrent transaction IDs.
*
@@ -160,18 +132,99 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
if (s != txn_state &&
(id = s->id) != WT_TXN_NONE &&
TXNID_LE(prev_oldest_id, id)) {
- if (get_snapshot)
- txn->snapshot[n++] = id;
+ txn->snapshot[n++] = id;
if (TXNID_LT(id, snap_min))
snap_min = id;
}
+ }
+
+ /*
+ * If we got a new snapshot, update the published snap_min for this
+ * session.
+ */
+ WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ txn_state->snap_min = snap_min;
+
+ /* Update the last running ID if we have a much newer value. */
+ if (snap_min > txn_global->last_running + 100)
+ txn_global->last_running = snap_min;
+
+ WT_ASSERT(session, txn_global->scan_count > 0);
+ (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+
+ __txn_sort_snapshot(session, n, current_id);
+}
+
+/*
+ * __wt_txn_update_oldest --
+ * Sweep the running transactions to update the oldest ID required.
+ * !!!
+ * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
+ * method (for the oldest transaction ID not yet visible to a running
+ * transaction), and then comparing that oldest ID against committed
+ * transactions to see if updates for a committed transaction are still
+ * visible to running transactions, the oldest transaction ID may be
+ * the same as the last committed transaction ID, if the transaction
+ * state wasn't refreshed after the last transaction committed. Push
+ * past the last committed transaction.
+ */
+void
+__wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *oldest_session;
+ WT_TXN_GLOBAL *txn_global;
+ WT_TXN_STATE *s;
+ uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min;
+ uint32_t i, session_cnt;
+ int32_t count;
+ int last_running_moved;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+ current_id = snap_min = txn_global->current;
+ oldest_session = NULL;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /*
+ * For pure read-only workloads, or if the update isn't forced and the
+ * oldest ID isn't too far behind, avoid scanning.
+ */
+ if (prev_oldest_id == current_id ||
+ (!force && TXNID_LT(current_id, prev_oldest_id + 100)))
+ return;
+
+ /*
+ * We're going to scan. Increment the count of scanners to prevent the
+ * oldest ID from moving forwards. Spin if the count is negative,
+ * which indicates that some thread is moving the oldest ID forwards.
+ */
+ do {
+ if ((count = txn_global->scan_count) < 0)
+ WT_PAUSE();
+ } while (count < 0 ||
+ !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+
+ /* The oldest ID cannot change until the scan count goes to zero. */
+ prev_oldest_id = txn_global->oldest_id;
+ current_id = oldest_id = snap_min = txn_global->current;
+
+ /* Walk the array of concurrent transactions. */
+ WT_ORDERED_READ(session_cnt, conn->session_cnt);
+ for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
/*
- * Ignore the session's own snap_min: we are about to update
- * it.
+ * Update the oldest ID.
+ *
+ * Ignore: IDs older than the oldest ID we saw. This can happen
+ * if we race with a thread that is allocating an ID -- the ID
+ * will not be used because the thread will keep spinning until
+ * it gets a valid one.
*/
- if (get_snapshot && s == txn_state)
- continue;
+ if ((id = s->id) != WT_TXN_NONE &&
+ TXNID_LE(prev_oldest_id, id) && TXNID_LT(id, snap_min))
+ snap_min = id;
/*
* !!!
@@ -184,51 +237,25 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
if ((id = s->snap_min) != WT_TXN_NONE &&
TXNID_LT(id, oldest_id)) {
oldest_id = id;
- oldest_session = i;
+ oldest_session = &conn->sessions[i];
}
}
if (TXNID_LT(snap_min, oldest_id))
oldest_id = snap_min;
- if (txn->id != WT_TXN_NONE && TXNID_LT(txn->id, oldest_id))
- oldest_id = txn->id;
- /*
- * If we got a new snapshot, update the published snap_min for this
- * session.
- */
- if (get_snapshot) {
- WT_ASSERT(session, TXNID_LE(prev_oldest_id, snap_min));
- WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
- txn_state->snap_min = snap_min;
- }
-
- /*
- * Update the last running ID if we have a much newer value or we are
- * forcing an update.
- */
- if (!get_snapshot || snap_min > txn_global->last_running + 100)
+ /* Update the last running ID. */
+ if (TXNID_LT(txn_global->last_running, snap_min)) {
txn_global->last_running = snap_min;
+ last_running_moved = 1;
+ } else
+ last_running_moved = 0;
- /*
- * Update the oldest ID if we have a newer ID and we can get exclusive
- * access. During normal snapshot refresh, only do this if we have a
- * much newer value. Once we get exclusive access, do another pass to
- * make sure nobody else is using an earlier ID.
- */
+ /* Update the oldest ID. */
if (TXNID_LT(prev_oldest_id, oldest_id) &&
- (!get_snapshot || oldest_id - prev_oldest_id > 100) &&
WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
- /*
- * Skip the checkpoint transaction; it is never read
- * from.
- */
- if (txn_global->checkpoint_id != WT_TXN_NONE &&
- s->id == txn_global->checkpoint_id)
- continue;
-
if ((id = s->id) != WT_TXN_NONE &&
TXNID_LT(id, oldest_id))
oldest_id = id;
@@ -241,31 +268,27 @@ __wt_txn_refresh(WT_SESSION_IMPL *session, int get_snapshot)
txn_global->scan_count = 0;
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
- current_id - oldest_id > 10000 &&
- txn_global->oldest_session != oldest_session) {
+ current_id - oldest_id > 10000 && last_running_moved &&
+ oldest_session != NULL) {
(void)__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %d [%s]"
" with snap_min %" PRIu64 "\n",
- oldest_id, oldest_session,
- conn->sessions[oldest_session].lastop,
- conn->sessions[oldest_session].txn.snap_min);
- txn_global->oldest_session = oldest_session;
+ oldest_id, oldest_session->id,
+ oldest_session->lastop,
+ oldest_session->txn.snap_min);
}
WT_ASSERT(session, txn_global->scan_count > 0);
(void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
}
-
- if (get_snapshot)
- __txn_sort_snapshot(session, n, current_id);
}
/*
- * __wt_txn_begin --
- * Begin a transaction.
+ * __wt_txn_config --
+ * Configure a transaction.
*/
int
-__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
+__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CONFIG_ITEM cval;
WT_TXN *txn;
@@ -273,9 +296,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
txn = &session->txn;
WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
- if (cval.len == 0)
- txn->isolation = session->isolation;
- else
+ if (cval.len != 0)
txn->isolation =
WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
TXN_ISO_SNAPSHOT :
@@ -294,18 +315,11 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
* !!! This is an unusual use of the config code: the "default" value
* we pass in is inherited from the connection.
*/
- txn->txn_logsync = S2C(session)->txn_logsync;
WT_RET(__wt_config_gets_def(session, cfg, "sync",
FLD_ISSET(txn->txn_logsync, WT_LOG_FLUSH) ? 1 : 0, &cval));
if (!cval.val)
txn->txn_logsync = 0;
- F_SET(txn, TXN_RUNNING);
- if (txn->isolation == TXN_ISO_SNAPSHOT) {
- if (session->ncursors > 0)
- WT_RET(__wt_session_copy_values(session));
- __wt_txn_refresh(session, 1);
- }
return (0);
}
@@ -325,10 +339,17 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn->notify = NULL;
txn_global = &S2C(session)->txn_global;
- txn_state = &txn_global->states[session->id];
+ txn_state = WT_SESSION_TXN_STATE(session);
/* Clear the transaction's ID from the global table. */
- if (F_ISSET(txn, TXN_HAS_ID)) {
+ if (WT_SESSION_IS_CHECKPOINT(session)) {
+ WT_ASSERT(session, txn_state->id == WT_TXN_NONE);
+ txn->id = WT_TXN_NONE;
+
+ /* Clear the global checkpoint transaction IDs. */
+ txn_global->checkpoint_id = 0;
+ txn_global->checkpoint_pinned = WT_TXN_NONE;
+ } else if (F_ISSET(txn, TXN_HAS_ID)) {
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
txn->id != WT_TXN_NONE);
WT_PUBLISH(txn_state->id, WT_TXN_NONE);
@@ -385,6 +406,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_txn_release_snapshot(session);
ret = __wt_txn_log_commit(session, cfg);
+ WT_ASSERT(session, ret == 0);
}
/*
@@ -515,19 +537,19 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
WT_TXN_GLOBAL *txn_global;
WT_CONNECTION_IMPL *conn;
WT_CONNECTION_STATS *stats;
- uint64_t checkpoint_snap_min;
+ uint64_t checkpoint_pinned;
conn = S2C(session);
txn_global = &conn->txn_global;
stats = &conn->stats;
- checkpoint_snap_min = txn_global->checkpoint_snap_min;
+ checkpoint_pinned = txn_global->checkpoint_pinned;
WT_STAT_SET(stats, txn_pinned_range,
txn_global->current - txn_global->oldest_id);
WT_STAT_SET(stats, txn_pinned_checkpoint_range,
- checkpoint_snap_min == WT_TXN_NONE ?
- 0 : txn_global->current - checkpoint_snap_min);
+ checkpoint_pinned == WT_TXN_NONE ?
+ 0 : txn_global->current - checkpoint_pinned);
}
/*
diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
index 1361b1a6682..08d8b778371 100644
--- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c
+++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c
@@ -349,6 +349,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_ISOLATION saved_isolation;
+ WT_TXN_STATE *txn_state;
const char *txn_cfg[] =
{ WT_CONFIG_BASE(session, session_begin_transaction),
"isolation=snapshot", NULL };
@@ -358,6 +359,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
conn = S2C(session);
txn_global = &conn->txn_global;
+ txn_state = WT_SESSION_TXN_STATE(session);
saved_isolation = session->isolation;
txn = &session->txn;
full = idle = logging = tracking = 0;
@@ -388,7 +390,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* This is particularly important for compact, so that all dirty pages
* can be fully written.
*/
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
/* Flush data-sources before we start the checkpoint. */
WT_ERR(__checkpoint_data_source(session, cfg));
@@ -426,6 +428,22 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__checkpoint_verbose_track(session,
"starting transaction", &verb_timer));
+ if (full)
+ WT_ERR(__wt_epoch(session, &start));
+
+ /*
+ * Bump the global checkpoint generation, used to figure out whether
+ * checkpoint has visited a tree. There is no need for this to be
+ * atomic: it is only written while holding the checkpoint lock.
+ *
+ * We do need to update it before clearing the checkpoint's entry out
+ * of the transaction table, or a thread evicting in a tree could
+ * ignore the checkpoint's transaction.
+ */
+ ++txn_global->checkpoint_gen;
+ WT_STAT_FAST_CONN_SET(session,
+ txn_checkpoint_generation, txn_global->checkpoint_gen);
+
/*
* Start a snapshot transaction for the checkpoint.
*
@@ -433,27 +451,44 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* side effects on cursors, which applications can hold open across
* calls to checkpoint.
*/
- if (full)
- WT_ERR(__wt_epoch(session, &start));
WT_ERR(__wt_txn_begin(session, txn_cfg));
/* Ensure a transaction ID is allocated prior to sharing it globally */
WT_ERR(__wt_txn_id_check(session));
+
/*
- * Save a copy of the checkpoint transaction ID so that refresh can
- * skip the checkpoint IDs. Save a copy of the snap min so that
- * visibility checks for the checkpoint use the right ID.
+ * Save the checkpoint session ID. We never do checkpoints in the
+ * default session (with id zero).
*/
- txn_global->checkpoint_id = session->txn.id;
- txn_global->checkpoint_snap_min = session->txn.snap_min;
+ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0);
+ txn_global->checkpoint_id = session->id;
+
+ txn_global->checkpoint_pinned =
+ WT_MIN(txn_state->id, txn_state->snap_min);
/*
- * No need for this to be atomic it is only written while holding the
- * checkpoint lock.
+ * We're about to clear the checkpoint transaction from the global
+ * state table so the oldest ID can move forward. Make sure everything
+ * we've done above is scheduled.
*/
- txn_global->checkpoint_gen += 1;
- WT_STAT_FAST_CONN_SET(session,
- txn_checkpoint_generation, txn_global->checkpoint_gen);
+ WT_FULL_BARRIER();
+
+ /*
+ * Sanity check that the oldest ID hasn't moved on before we have
+ * cleared our entry.
+ */
+ WT_ASSERT(session,
+ TXNID_LE(txn_global->oldest_id, txn_state->id) &&
+ TXNID_LE(txn_global->oldest_id, txn_state->snap_min));
+
+ /*
+ * Clear our entry from the global transaction session table. Any
+ * operation that needs to know about the ID for this checkpoint will
+ * consider the checkpoint ID in the global structure. Most operations
+ * can safely ignore the checkpoint ID (see the visible all check for
+ * details).
+ */
+ txn_state->id = txn_state->snap_min = WT_TXN_NONE;
/* Tell logging that we have started a database checkpoint. */
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && full) {
@@ -474,10 +509,6 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
/* Release the snapshot so we aren't pinning pages in cache. */
__wt_txn_release_snapshot(session);
- /* Clear the global checkpoint transaction IDs */
- txn_global->checkpoint_id = WT_TXN_NONE;
- txn_global->checkpoint_snap_min = WT_TXN_NONE;
-
WT_ERR(__checkpoint_verbose_track(session,
"committing transaction", &verb_timer));
@@ -550,10 +581,6 @@ err: /*
WT_TRET(__wt_txn_rollback(session, NULL));
}
- /* Ensure the checkpoint IDs are cleared on the error path. */
- txn_global->checkpoint_id = WT_TXN_NONE;
- txn_global->checkpoint_snap_min = WT_TXN_NONE;
-
/*
* Tell logging that we have finished a database checkpoint. Do not
* write a log record if the database was idle.
@@ -806,10 +833,8 @@ __checkpoint_worker(
force = 1;
}
if (!btree->modified && !force) {
- if (!is_checkpoint) {
- F_SET(btree, WT_BTREE_SKIP_CKPT);
- goto done;
- }
+ if (!is_checkpoint)
+ goto nockpt;
deleted = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -828,7 +853,12 @@ __checkpoint_worker(
(WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) &&
deleted < 2) {
- F_SET(btree, WT_BTREE_SKIP_CKPT);
+nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT);
+ WT_PUBLISH(btree->checkpoint_gen,
+ S2C(session)->txn_global.checkpoint_gen);
+ WT_STAT_FAST_DATA_SET(session,
+ btree_checkpoint_generation,
+ btree->checkpoint_gen);
goto done;
}
}
@@ -1056,16 +1086,8 @@ fake: /*
WT_ERR(__wt_txn_checkpoint_log(
session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
- /*
- * Update the checkpoint generation for this handle so visible
- * updates newer than the checkpoint can be evicted.
- */
-done: btree->checkpoint_gen = conn->txn_global.checkpoint_gen;
- WT_STAT_FAST_DATA_SET(session,
- btree_checkpoint_generation, btree->checkpoint_gen);
-
-err:
- /*
+done:
+err: /*
* If the checkpoint didn't complete successfully, make sure the
* tree is marked dirty.
*/
@@ -1142,7 +1164,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
* for active readers.
*/
if (!btree->modified && !bulk) {
- __wt_txn_update_oldest(session);
+ __wt_txn_update_oldest(session, 1);
return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
__wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY);
}