Diffstat (limited to 'src')
50 files changed, 540 insertions, 444 deletions
diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 401d0616eab..b1bc3902f7c 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -216,7 +216,7 @@ __async_worker_execop(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, break; case WT_AOP_NONE: WT_RET_MSG(session, EINVAL, - "Unknown async optype %d\n", op->optype); + "Unknown async optype %d", op->optype); } return (0); } diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 3690b41ead4..41ae457b0fe 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -1217,7 +1217,7 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) /* * __wt_btcur_init -- - * Initialize an cursor used for internal purposes. + * Initialize a cursor used for internal purposes. */ void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index 9591023e163..337a3ea036f 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -271,6 +271,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) else F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + WT_RET(__wt_config_gets(session, + cfg, "ignore_in_memory_cache_size", &cval)); + if (cval.val) { + if (!F_ISSET(conn, WT_CONN_IN_MEMORY)) + WT_RET_MSG(session, EINVAL, + "ignore_in_memory_cache_size setting is only valid " + "with databases configured to run in-memory"); + F_SET(btree, WT_BTREE_IGNORE_CACHE); + } else + F_CLR(btree, WT_BTREE_IGNORE_CACHE); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); @@ -353,7 +364,7 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush")); btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */ - btree->modified = 0; /* Clean */ + btree->modified = false; /* Clean */ btree->write_gen = ckpt->write_gen; /* Write generation */ return (0); diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index b41179a565d..7b583bd9c1e 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -24,20 +24,20 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_REF *walk; WT_TXN *txn; uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages; - uint64_t oldest_id, saved_snap_min; + uint64_t oldest_id, saved_pinned_id; uint32_t flags; conn = S2C(session); btree = S2BT(session); walk = NULL; txn = &session->txn; - saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min; + saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; flags = WT_READ_CACHE | WT_READ_NO_GEN; internal_bytes = leaf_bytes = 0; internal_pages = leaf_pages = 0; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) - WT_RET(__wt_epoch(session, &start)); + __wt_epoch(session, &start); switch (syncop) { case WT_SYNC_WRITE_LEAVES: @@ -205,15 +205,14 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) } if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { - WT_ERR(__wt_epoch(session, &end)); + __wt_epoch(session, &end); __wt_verbose(session, WT_VERB_CHECKPOINT, - "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64 - " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64 - " bytes, %" PRIu64 " pages of internal\n\t" - "Took: %" PRIu64 "ms", + "__sync_file WT_SYNC_%s wrote: %" PRIu64 + " leaf pages (%" PRIu64 "B), %" PRIu64 + " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms", syncop == WT_SYNC_WRITE_LEAVES ? 
"WRITE_LEAVES" : "CHECKPOINT", - leaf_bytes, leaf_pages, internal_bytes, internal_pages, + leaf_pages, leaf_bytes, internal_pages, internal_bytes, WT_TIMEDIFF_MS(end, start)); } @@ -226,7 +225,7 @@ err: /* On error, clear any left-over tree walk. */ * snapshot active when we started, release it. */ if (txn->isolation == WT_ISO_READ_COMMITTED && - saved_snap_min == WT_TXN_NONE) + saved_pinned_id == WT_TXN_NONE) __wt_txn_release_snapshot(session); /* Clear the checkpoint flag and push the change. */ diff --git a/src/checksum/power8/README.md b/src/checksum/power8/README.md index 3e2976650cd..579d841a02c 100644 --- a/src/checksum/power8/README.md +++ b/src/checksum/power8/README.md @@ -39,7 +39,7 @@ Quick start - Type make to create the constants (crc32_constants.h) -- Import the code into your application (crc32.S crc32_wrapper.c +- Import the code into your application (crc32.sx crc32_wrapper.c crc32_constants.h ppc-opcode.h) and call the CRC: ``` diff --git a/src/checksum/power8/crc32.S b/src/checksum/power8/crc32.sx index 0b7870668b5..0b7870668b5 100644 --- a/src/checksum/power8/crc32.S +++ b/src/checksum/power8/crc32.sx diff --git a/src/checksum/zseries/crc32le-vx.S b/src/checksum/zseries/crc32le-vx.sx index 0f1392b0952..0f1392b0952 100644 --- a/src/checksum/zseries/crc32le-vx.S +++ b/src/checksum/zseries/crc32le-vx.sx diff --git a/src/config/config_collapse.c b/src/config/config_collapse.c index ea956ebfff9..7fe78d06ba7 100644 --- a/src/config/config_collapse.c +++ b/src/config/config_collapse.c @@ -47,7 +47,7 @@ __wt_config_collapse( if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID) WT_ERR_MSG(session, EINVAL, - "Invalid configuration key found: '%s'\n", k.str); + "Invalid configuration key found: '%s'", k.str); WT_ERR(__wt_config_get(session, cfg, &k, &v)); /* Include the quotes around string keys/values. */ if (k.type == WT_CONFIG_ITEM_STRING) { @@ -132,7 +132,7 @@ __config_merge_scan(WT_SESSION_IMPL *session, if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID) WT_ERR_MSG(session, EINVAL, - "Invalid configuration key found: '%s'\n", k.str); + "Invalid configuration key found: '%s'", k.str); /* Include the quotes around string keys/values. 
*/ if (k.type == WT_CONFIG_ITEM_STRING) { diff --git a/src/config/config_def.c b/src/config/config_def.c index 7bad5f12a9f..7bce4bc9cef 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -246,6 +246,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_create[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "immutable", "boolean", NULL, NULL, NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, @@ -413,6 +416,9 @@ static const WT_CONFIG_CHECK confchk_file_config[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, @@ -471,6 +477,9 @@ static const WT_CONFIG_CHECK confchk_file_meta[] = { { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, { "id", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, @@ -544,6 +553,9 @@ static const WT_CONFIG_CHECK confchk_lsm_meta[] = { { "format", "string", NULL, "choices=[\"btree\"]", NULL, 0 }, { "huffman_key", "string", NULL, NULL, NULL, 0 }, { "huffman_value", "string", NULL, NULL, NULL, 0 }, + { "ignore_in_memory_cache_size", "boolean", + NULL, NULL, + NULL, 0 }, { "internal_item_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_max", "int", NULL, "min=0", NULL, 0 }, { "internal_key_truncate", "boolean", NULL, NULL, NULL, 0 }, @@ -1053,18 +1065,18 @@ static const WT_CONFIG_ENTRY config_entries[] = { "block_compressor=,cache_resident=false,checksum=uncompressed," "colgroups=,collator=,columns=,dictionary=0,encryption=(keyid=," "name=),exclusive=false,extractor=,format=btree,huffman_key=," - "huffman_value=,immutable=false,internal_item_max=0," - "internal_key_max=0,internal_key_truncate=true," - "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," - "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," - "log=(enabled=true),lsm=(auto_throttle=true,bloom=true," - "bloom_bit_count=16,bloom_config=,bloom_hash_count=8," + "huffman_value=,ignore_in_memory_cache_size=false,immutable=false" + ",internal_item_max=0,internal_key_max=0," + "internal_key_truncate=true,internal_page_max=4KB,key_format=u," + "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," + "leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true," + "bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8," "bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB," "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," "prefix_compression_min=4,source=,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=75,type=file,value_format=u", - confchk_WT_SESSION_create, 40 + confchk_WT_SESSION_create, 41 }, { "WT_SESSION.drop", "checkpoint_wait=true,force=false,lock_wait=true," @@ 
-1148,7 +1160,8 @@ static const WT_CONFIG_ENTRY config_entries[] = { "allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=false,checksum=uncompressed," "collator=,columns=,dictionary=0,encryption=(keyid=,name=)," - "format=btree,huffman_key=,huffman_value=,internal_item_max=0," + "format=btree,huffman_key=,huffman_value=," + "ignore_in_memory_cache_size=false,internal_item_max=0," "internal_key_max=0,internal_key_truncate=true," "internal_page_max=4KB,key_format=u,key_gap=10,leaf_item_max=0," "leaf_key_max=0,leaf_page_max=32KB,leaf_value_max=0," @@ -1156,14 +1169,15 @@ static const WT_CONFIG_ENTRY config_entries[] = { "os_cache_max=0,prefix_compression=false,prefix_compression_min=4" ",split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," "value_format=u", - confchk_file_config, 33 + confchk_file_config, 34 }, { "file.meta", "allocation_size=4KB,app_metadata=,block_allocation=best," "block_compressor=,cache_resident=false,checkpoint=," "checkpoint_lsn=,checksum=uncompressed,collator=,columns=," "dictionary=0,encryption=(keyid=,name=),format=btree,huffman_key=" - ",huffman_value=,id=,internal_item_max=0,internal_key_max=0," + ",huffman_value=,id=,ignore_in_memory_cache_size=false," + "internal_item_max=0,internal_key_max=0," "internal_key_truncate=true,internal_page_max=4KB,key_format=u," "key_gap=10,leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," "leaf_value_max=0,log=(enabled=true),memory_page_max=5MB," @@ -1171,7 +1185,7 @@ static const WT_CONFIG_ENTRY config_entries[] = { "prefix_compression_min=4,split_deepen_min_child=0," "split_deepen_per_child=0,split_pct=75,value_format=u," "version=(major=0,minor=0)", - confchk_file_meta, 37 + confchk_file_meta, 38 }, { "index.meta", "app_metadata=,collator=,columns=,extractor=,immutable=false," @@ -1183,18 +1197,19 @@ static const WT_CONFIG_ENTRY config_entries[] = { "block_compressor=,cache_resident=false,checksum=uncompressed," "chunks=,collator=,columns=,dictionary=0,encryption=(keyid=," "name=),format=btree,huffman_key=,huffman_value=," - "internal_item_max=0,internal_key_max=0," - "internal_key_truncate=true,internal_page_max=4KB,key_format=u," - "key_gap=10,last=,leaf_item_max=0,leaf_key_max=0," - "leaf_page_max=32KB,leaf_value_max=0,log=(enabled=true)," - "lsm=(auto_throttle=true,bloom=true,bloom_bit_count=16," - "bloom_config=,bloom_hash_count=8,bloom_oldest=false," - "chunk_count_limit=0,chunk_max=5GB,chunk_size=10MB,merge_max=15," - "merge_min=0),memory_page_max=5MB,old_chunks=," - "os_cache_dirty_max=0,os_cache_max=0,prefix_compression=false," - "prefix_compression_min=4,split_deepen_min_child=0," - "split_deepen_per_child=0,split_pct=75,value_format=u", - confchk_lsm_meta, 37 + "ignore_in_memory_cache_size=false,internal_item_max=0," + "internal_key_max=0,internal_key_truncate=true," + "internal_page_max=4KB,key_format=u,key_gap=10,last=," + "leaf_item_max=0,leaf_key_max=0,leaf_page_max=32KB," + "leaf_value_max=0,log=(enabled=true),lsm=(auto_throttle=true," + "bloom=true,bloom_bit_count=16,bloom_config=,bloom_hash_count=8," + "bloom_oldest=false,chunk_count_limit=0,chunk_max=5GB," + "chunk_size=10MB,merge_max=15,merge_min=0),memory_page_max=5MB," + "old_chunks=,os_cache_dirty_max=0,os_cache_max=0," + "prefix_compression=false,prefix_compression_min=4," + "split_deepen_min_child=0,split_deepen_per_child=0,split_pct=75," + "value_format=u", + confchk_lsm_meta, 38 }, { "table.meta", "app_metadata=,colgroups=,collator=,columns=,key_format=u," diff --git a/src/conn/conn_cache.c 
b/src/conn/conn_cache.c index 6788b1f7f47..1b8b3183d3c 100644 --- a/src/conn/conn_cache.c +++ b/src/conn/conn_cache.c @@ -42,47 +42,38 @@ __cache_config_local(WT_SESSION_IMPL *session, bool shared, const char *cfg[]) WT_RET(__wt_config_gets(session, cfg, "eviction_trigger", &cval)); cache->eviction_trigger = (u_int)cval.val; - if (F_ISSET(conn, WT_CONN_IN_MEMORY)) - cache->eviction_checkpoint_target = - cache->eviction_dirty_target = - cache->eviction_dirty_trigger = 100U; - else { - WT_RET(__wt_config_gets( - session, cfg, "eviction_checkpoint_target", &cval)); - cache->eviction_checkpoint_target = (u_int)cval.val; + WT_RET(__wt_config_gets( + session, cfg, "eviction_checkpoint_target", &cval)); + cache->eviction_checkpoint_target = (u_int)cval.val; - WT_RET(__wt_config_gets( - session, cfg, "eviction_dirty_target", &cval)); - cache->eviction_dirty_target = (u_int)cval.val; + WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_target", &cval)); + cache->eviction_dirty_target = (u_int)cval.val; - /* - * Don't allow the dirty target to be larger than the overall - * target. - */ - if (cache->eviction_dirty_target > cache->eviction_target) - cache->eviction_dirty_target = cache->eviction_target; + /* + * Don't allow the dirty target to be larger than the overall + * target. + */ + if (cache->eviction_dirty_target > cache->eviction_target) + cache->eviction_dirty_target = cache->eviction_target; - /* - * Sanity check the checkpoint target: don't allow a value - * lower than the dirty target. - */ - if (cache->eviction_checkpoint_target > 0 && - cache->eviction_checkpoint_target < - cache->eviction_dirty_target) - cache->eviction_checkpoint_target = - cache->eviction_dirty_target; + /* + * Sanity check the checkpoint target: don't allow a value + * lower than the dirty target. + */ + if (cache->eviction_checkpoint_target > 0 && + cache->eviction_checkpoint_target < cache->eviction_dirty_target) + cache->eviction_checkpoint_target = + cache->eviction_dirty_target; - WT_RET(__wt_config_gets( - session, cfg, "eviction_dirty_trigger", &cval)); - cache->eviction_dirty_trigger = (u_int)cval.val; + WT_RET(__wt_config_gets(session, cfg, "eviction_dirty_trigger", &cval)); + cache->eviction_dirty_trigger = (u_int)cval.val; - /* - * Don't allow the dirty trigger to be larger than the overall - * trigger or we can get stuck with a cache full of dirty data. - */ - if (cache->eviction_dirty_trigger > cache->eviction_trigger) - cache->eviction_dirty_trigger = cache->eviction_trigger; - } + /* + * Don't allow the dirty trigger to be larger than the overall + * trigger or we can get stuck with a cache full of dirty data. + */ + if (cache->eviction_dirty_trigger > cache->eviction_trigger) + cache->eviction_dirty_trigger = cache->eviction_trigger; WT_RET(__wt_config_gets(session, cfg, "eviction.threads_max", &cval)); WT_ASSERT(session, cval.val > 0); diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index b4913043d63..faeef4e71a2 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -87,22 +87,36 @@ __ckpt_server(void *arg) */ __wt_cond_wait(session, conn->ckpt_cond, conn->ckpt_usecs); - /* Checkpoint the database. */ - WT_ERR(wt_session->checkpoint(wt_session, NULL)); - - /* Reset. */ - if (conn->ckpt_logsize) { - __wt_log_written_reset(session); - conn->ckpt_signalled = false; - - /* - * In case we crossed the log limit during the - * checkpoint and the condition variable was already - * signalled, do a tiny wait to clear it so we don't do - * another checkpoint immediately. 
- */ - __wt_cond_wait(session, conn->ckpt_cond, 1); - } + /* + * Checkpoint the database if the connection is marked dirty. + * A connection is marked dirty whenever a btree gets marked + * dirty, which reflects upon a change in the database that + * needs to be checkpointed. Said that, there can be short + * instances when a btree gets marked dirty and the connection + * is yet to be. We might skip a checkpoint in that short + * instance, which is okay because by the next time we get to + * checkpoint, the connection would have been marked dirty and + * hence the checkpoint will not be skipped this time. + */ + if (conn->modified) { + WT_ERR(wt_session->checkpoint(wt_session, NULL)); + + /* Reset. */ + if (conn->ckpt_logsize) { + __wt_log_written_reset(session); + conn->ckpt_signalled = false; + + /* + * In case we crossed the log limit during the + * checkpoint and the condition variable was + * already signalled, do a tiny wait to clear + * it so we don't do another checkpoint + * immediately. + */ + __wt_cond_wait(session, conn->ckpt_cond, 1); + } + } else + WT_STAT_CONN_INCR(session, txn_checkpoint_skipped); } if (0) { diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 2786526c2fa..34743034877 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -839,10 +839,10 @@ __log_server(void *arg) /* Wait until the next event. */ - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); __wt_cond_auto_wait_signal(session, conn->log_cond, did_work, &signalled); - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); timediff = WT_TIMEDIFF_MS(now, start); } diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 66979dfd023..d5a31c671c0 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -415,7 +415,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp) conn = S2C(session); /* Get the current local time of day. */ - WT_RET(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); tm = localtime_r(&ts.tv_sec, &_tm); /* Create the logging path name for this time of day. */ diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 03593f8951a..dba37fa2eb0 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -271,7 +271,7 @@ __sweep_server(void *arg) /* Wait until the next event. */ __wt_cond_wait(session, conn->sweep_cond, conn->sweep_interval * WT_MILLION); - WT_ERR(__wt_seconds(session, &now)); + __wt_seconds(session, &now); WT_STAT_CONN_INCR(session, dh_sweeps); diff --git a/src/docs/wtperf.dox b/src/docs/wtperf.dox index a49d0d9f871..df66ad43355 100644 --- a/src/docs/wtperf.dox +++ b/src/docs/wtperf.dox @@ -150,33 +150,25 @@ number of async worker threads @par checkpoint_interval (unsigned int, default=120) checkpoint every interval seconds during the workload phase. @par checkpoint_stress_rate (unsigned int, default=0) -checkpoint every rate operations during the populate phase in the -populate thread(s), 0 to disable +checkpoint every rate operations during the populate phase in the populate thread(s), 0 to disable @par checkpoint_threads (unsigned int, default=0) number of checkpoint threads -@par conn_config (string, default=create) +@par conn_config (string, default="create") connection configuration string @par compact (boolean, default=false) post-populate compact for LSM merging activity -@par compression (string, default=none) -compression extension. Allowed configuration values are: 'none', -'lz4', 'snappy', 'zlib' +@par compression (string, default="none") +compression extension. 
Allowed configuration values are: 'none', 'lz4', 'snappy', 'zlib' @par create (boolean, default=true) do population phase; false to use existing database @par database_count (unsigned int, default=1) -number of WiredTiger databases to use. Each database will execute the -workload using a separate home directory and complete set of worker -threads -@par drop_tables (unsigned int, default=0) -Whether to drop all tables at the end of the run, and report time -taken to do the drop. +number of WiredTiger databases to use. Each database will execute the workload using a separate home directory and complete set of worker threads +@par drop_tables (boolean, default=false) +Whether to drop all tables at the end of the run, and report time taken to do the drop. @par icount (unsigned int, default=5000) -number of records to initially populate. If multiple tables are -configured the count is spread evenly across all tables. +number of records to initially populate. If multiple tables are configured the count is spread evenly across all tables. @par idle_table_cycle (unsigned int, default=0) -Enable regular create and drop of idle tables, value is the maximum -number of seconds a create or drop is allowed before flagging an -error. Default 0 which means disabled. +Enable regular create and drop of idle tables, value is the maximum number of seconds a create or drop is allowed before flagging an error. Default 0 which means disabled. @par index (boolean, default=false) Whether to create an index on the value field. @par insert_rmw (boolean, default=false) @@ -188,28 +180,21 @@ perform partial logging on first table only. @par log_like_table (boolean, default=false) Append all modification operations to another shared table. @par min_throughput (unsigned int, default=0) -notify if any throughput measured is less than this amount. Aborts or -prints warning based on min_throughput_fatal setting. Requires -sample_interval to be configured +notify if any throughput measured is less than this amount. Aborts or prints warning based on min_throughput_fatal setting. Requires sample_interval to be configured @par min_throughput_fatal (boolean, default=false) print warning (false) or abort (true) of min_throughput failure. @par max_latency (unsigned int, default=0) -notify if any latency measured exceeds this number of -milliseconds.Aborts or prints warning based on min_throughput_fatal -setting. Requires sample_interval to be configured +notify if any latency measured exceeds this number of milliseconds. Aborts or prints warning based on min_throughput_fatal setting. Requires sample_interval to be configured @par max_latency_fatal (boolean, default=false) print warning (false) or abort (true) of max_latency failure. @par pareto (unsigned int, default=0) -use pareto distribution for random numbers. Zero to disable, otherwise -a percentage indicating how aggressive the distribution should be. +use pareto distribution for random numbers. Zero to disable, otherwise a percentage indicating how aggressive the distribution should be. 
@par populate_ops_per_txn (unsigned int, default=0) -number of operations to group into each transaction in the populate -phase, zero for auto-commit +number of operations to group into each transaction in the populate phase, zero for auto-commit @par populate_threads (unsigned int, default=1) number of populate threads, 1 for bulk load @par random_range (unsigned int, default=0) -if non zero choose a value from within this range as the key for -insert operations +if non zero choose a value from within this range as the key for insert operations @par random_value (boolean, default=false) generate random content for the value @par range_partition (boolean, default=false) @@ -217,9 +202,7 @@ partition data by range (vs hash) @par read_range (unsigned int, default=0) scan a range of keys after each search @par readonly (boolean, default=false) -reopen the connection between populate and workload phases in readonly -mode. Requires reopen_connection turned on (default). Requires that -read be the only workload specified +reopen the connection between populate and workload phases in readonly mode. Requires reopen_connection turned on (default). Requires that read be the only workload specified @par reopen_connection (boolean, default=true) close and reopen the connection between populate and workload phases @par report_interval (unsigned int, default=2) @@ -231,40 +214,22 @@ total workload seconds @par sample_interval (unsigned int, default=0) performance logging every interval seconds, 0 to disable @par sample_rate (unsigned int, default=50) -how often the latency of operations is measured. One for every -operation,two for every second operation, three for every third -operation etc. -@par sess_config (string, default=) +how often the latency of operations is measured. One for every operation, two for every second operation, three for every third operation etc. +@par sess_config (string, default="") session configuration string @par session_count_idle (unsigned int, default=0) number of idle sessions to create. Default 0. -@par table_config (string, default=key_format=S,value_format=S,type=lsm,exclusive=true,allocation_size=4kb,internal_page_max=64kb,leaf_page_max=4kb,split_pct=100) +@par table_config (string, default="key_format=S,value_format=S,type=lsm,exclusive=true, allocation_size=4kb,internal_page_max=64kb,leaf_page_max=4kb, split_pct=100") table configuration string @par table_count (unsigned int, default=1) -number of tables to run operations over. Keys are divided evenly over -the tables. Cursors are held open on all tables. Default 1, maximum -99999. +number of tables to run operations over. Keys are divided evenly over the tables. Cursors are held open on all tables. Default 1, maximum 99999. @par table_count_idle (unsigned int, default=0) number of tables to create, that won't be populated. Default 0. -@par threads (string, default=) -workload configuration: each 'count' entry is the total number of -threads, and the 'insert', 'read' and 'update' entries are the ratios -of insert, read and update operations done by each worker thread; If a -throttle value is provided each thread will do a maximum of that -number of operations per second; multiple workload configurations may -be specified per threads configuration; for example, a more complex -threads configuration might be -'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' -which would create 2 threads doing nothing but reads and 8 threads -each doing 50% inserts and 25% reads and updates. 
Allowed -configuration values are 'count', 'throttle', 'update_delta', 'reads', -'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. -There are also behavior modifiers, supported modifiers are -'ops_per_txn' -@par transaction_config (string, default=) -transaction configuration string, relevant when populate_opts_per_txn -is nonzero -@par table_name (string, default=test) +@par threads (string, default="") +workload configuration: each 'count' entry is the total number of threads, and the 'insert', 'read' and 'update' entries are the ratios of insert, read and update operations done by each worker thread; If a throttle value is provided each thread will do a maximum of that number of operations per second; multiple workload configurations may be specified per threads configuration; for example, a more complex threads configuration might be 'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' which would create 2 threads doing nothing but reads and 8 threads each doing 50% inserts and 25% reads and updates. Allowed configuration values are 'count', 'throttle', 'update_delta', 'reads', 'inserts', 'updates', 'truncate', 'truncate_pct' and 'truncate_count'. There are also behavior modifiers, supported modifiers are 'ops_per_txn' +@par transaction_config (string, default="") +WT_SESSION.begin_transaction configuration string, applied during the populate phase when populate_ops_per_txn is nonzero +@par table_name (string, default="test") table name @par truncate_single_ops (boolean, default=false) Implement truncate via cursor remove instead of session API diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index acc81f566a5..45ec9bce3b5 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -233,10 +233,10 @@ __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) #ifdef HAVE_DIAGNOSTIC /* - * Ensure the cache stuck timer is initialized when starting eviction + * Ensure the cache stuck timer is initialized when starting eviction. */ if (thread->id == 0) - WT_ERR(__wt_epoch(session, &cache->stuck_ts)); + __wt_epoch(session, &cache->stuck_ts); #endif while (F_ISSET(conn, WT_CONN_EVICTION_RUN) && @@ -350,10 +350,10 @@ __evict_server(WT_SESSION_IMPL *session, bool *did_work) } else if (cache->pages_evicted != cache->pages_evict) { cache->pages_evicted = cache->pages_evict; #ifdef HAVE_DIAGNOSTIC - WT_RET(__wt_epoch(session, &cache->stuck_ts)); + __wt_epoch(session, &cache->stuck_ts); } else { /* After being stuck for 5 minutes, give up. 
*/ - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); if (WT_TIMEDIFF_SEC(now, cache->stuck_ts) > 300) { ret = ETIMEDOUT; __wt_err(session, ret, @@ -465,16 +465,16 @@ __evict_update_work(WT_SESSION_IMPL *session) */ bytes_max = conn->cache_size + 1; bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_CLEAN); if (__wt_eviction_clean_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); + else if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) + F_SET(cache, WT_CACHE_EVICT_CLEAN); dirty_inuse = __wt_cache_dirty_leaf_inuse(cache); - if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) - F_SET(cache, WT_CACHE_EVICT_DIRTY); if (__wt_eviction_dirty_needed(session, NULL)) F_SET(cache, WT_CACHE_EVICT_DIRTY | WT_CACHE_EVICT_DIRTY_HARD); + else if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) + F_SET(cache, WT_CACHE_EVICT_DIRTY); /* * If application threads are blocked by the total volume of data in @@ -506,12 +506,6 @@ __evict_update_work(WT_SESSION_IMPL *session) F_CLR(cache, WT_CACHE_EVICT_CLEAN | WT_CACHE_EVICT_CLEAN_HARD); } - /* If threads are blocked by eviction we should be looking for pages. */ - WT_ASSERT(session, !F_ISSET(cache, WT_CACHE_EVICT_CLEAN_HARD) || - F_ISSET(cache, WT_CACHE_EVICT_CLEAN)); - WT_ASSERT(session, !F_ISSET(cache, WT_CACHE_EVICT_DIRTY_HARD) || - F_ISSET(cache, WT_CACHE_EVICT_DIRTY)); - WT_STAT_CONN_SET(session, cache_eviction_state, F_MASK(cache, WT_CACHE_EVICT_MASK)); @@ -543,7 +537,7 @@ __evict_pass(WT_SESSION_IMPL *session) /* Evict pages from the cache. */ for (loop = 0; cache->pass_intr == 0; loop++) { - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); if (loop == 0) prev = now; @@ -895,12 +889,11 @@ __evict_lru_walk(WT_SESSION_IMPL *session) /* Fill the next queue (that isn't the urgent queue). */ queue = cache->evict_fill_queue; other_queue = cache->evict_queues + (1 - (queue - cache->evict_queues)); + cache->evict_fill_queue = other_queue; /* If this queue is full, try the other one. */ if (__evict_queue_full(queue) && !__evict_queue_full(other_queue)) queue = other_queue; - cache->evict_fill_queue = - &cache->evict_queues[1 - (queue - cache->evict_queues)]; /* * If both queues are full and haven't been empty on recent refills, @@ -1079,6 +1072,17 @@ __evict_walk(WT_SESSION_IMPL *session, WT_EVICT_QUEUE *queue) start_slot = slot = queue->evict_entries; max_entries = WT_MIN(slot + WT_EVICT_WALK_INCR, cache->evict_slots); + /* + * Another pathological case: if there are only a tiny number of + * candidate pages in cache, don't put all of them on one queue. + */ + if (F_ISSET(cache, WT_CACHE_EVICT_CLEAN)) + max_entries = WT_MIN(max_entries, + 1 + (uint32_t)(__wt_cache_pages_inuse(cache) / 2)); + else + max_entries = WT_MIN(max_entries, + 1 + (uint32_t)(cache->pages_dirty_leaf / 2)); + retry: while (slot < max_entries) { /* * If another thread is waiting on the eviction server to clear @@ -1508,19 +1512,22 @@ fast: /* If the page can't be evicted, give up. */ btree->evict_walk_period = 0; /* - * If we happen to end up on the root page, clear it. We have to track - * hazard pointers, and the root page complicates that calculation. + * If we happen to end up on the root page or a page requiring urgent + * eviction, clear it. We have to track hazard pointers, and the root + * page complicates that calculation. 
* * Likewise if we found no new candidates during the walk: there is no - * point keeping a page pinned, since it may be the only candidate in an - * idle tree. + * point keeping a page pinned, since it may be the only candidate in + * an idle tree. * * If we land on a page requiring forced eviction, move on to the next * page: we want this page evicted as quickly as possible. */ if ((ref = btree->evict_ref) != NULL) { /* Give up the walk occasionally. */ - if (__wt_ref_is_root(ref) || evict == start || give_up) + if (__wt_ref_is_root(ref) || evict == start || give_up || + ref->page->read_gen == WT_READGEN_OLDEST || + ref->page->memory_footprint >= btree->splitmempage) WT_RET(__evict_clear_walk(session, restarts == 0)); else if (ref->page->read_gen == WT_READGEN_OLDEST) WT_RET_NOTFOUND_OK(__wt_tree_walk_count( @@ -1543,14 +1550,14 @@ __evict_get_ref( WT_SESSION_IMPL *session, bool is_server, WT_BTREE **btreep, WT_REF **refp) { WT_CACHE *cache; - WT_DECL_RET; WT_EVICT_ENTRY *evict; WT_EVICT_QUEUE *queue, *other_queue, *urgent_queue; uint32_t candidates; - bool is_app, urgent_ok; + bool is_app, server_only, urgent_ok; cache = S2C(session)->cache; is_app = !F_ISSET(session, WT_SESSION_INTERNAL); + server_only = is_server && !WT_EVICT_HAS_WORKERS(session); urgent_ok = (!is_app && !is_server) || !WT_EVICT_HAS_WORKERS(session) || __wt_cache_aggressive(session); @@ -1569,7 +1576,8 @@ __evict_get_ref( } /* - * The server repopulates whenever the other queue is not full. + * The server repopulates whenever the other queue is not full, as long + * as at least one page has been evicted out of the current queue. * * Note that there are pathological cases where there are only enough * eviction candidates in the cache to fill one queue. In that case, @@ -1577,18 +1585,14 @@ __evict_get_ref( * Such cases are extremely rare in real applications. */ if (is_server && + (!urgent_ok || __evict_queue_empty(urgent_queue, false)) && + !__evict_queue_full(cache->evict_current_queue) && + !__evict_queue_full(cache->evict_fill_queue) && (cache->evict_empty_score > WT_EVICT_SCORE_CUTOFF || - __evict_queue_empty(cache->evict_fill_queue, false))) { - while ((ret = __wt_spin_trylock( - session, &cache->evict_queue_lock)) == EBUSY) - if ((!urgent_ok || - __evict_queue_empty(urgent_queue, false)) && - !__evict_queue_full(cache->evict_fill_queue)) - return (WT_NOTFOUND); + __evict_queue_empty(cache->evict_fill_queue, false))) + return (WT_NOTFOUND); - WT_RET(ret); - } else - __wt_spin_lock(session, &cache->evict_queue_lock); + __wt_spin_lock(session, &cache->evict_queue_lock); /* Check the urgent queue first. */ if (urgent_ok && !__evict_queue_empty(urgent_queue, false)) @@ -1596,17 +1600,15 @@ __evict_get_ref( else { /* * Check if the current queue needs to change. - * The current queue could have changed while we waited for - * the lock. * * The server will only evict half of the pages before looking - * for more. The remainder are left to eviction workers (if any - * configured), or application threads if necessary. + * for more, but should only switch queues if there are no + * other eviction workers. 
*/ queue = cache->evict_current_queue; other_queue = cache->evict_other_queue; - if (__evict_queue_empty(queue, is_server) && - !__evict_queue_empty(other_queue, is_server)) { + if (__evict_queue_empty(queue, server_only) && + !__evict_queue_empty(other_queue, server_only)) { cache->evict_current_queue = other_queue; cache->evict_other_queue = queue; } @@ -1715,15 +1717,19 @@ __evict_get_ref( static int __evict_page(WT_SESSION_IMPL *session, bool is_server) { + struct timespec enter, leave; WT_BTREE *btree; WT_CACHE *cache; WT_DECL_RET; WT_REF *ref; + bool app_timer; WT_RET(__evict_get_ref(session, is_server, &btree, &ref)); WT_ASSERT(session, ref->state == WT_REF_LOCKED); + app_timer = false; cache = S2C(session)->cache; + /* * An internal session flags either the server itself or an eviction * worker thread. @@ -1739,6 +1745,10 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) WT_STAT_CONN_INCR(session, cache_eviction_app_dirty); WT_STAT_CONN_INCR(session, cache_eviction_app); cache->app_evicts++; + if (WT_STAT_ENABLED(session)) { + app_timer = true; + __wt_epoch(session, &enter); + } } /* @@ -1756,6 +1766,11 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) (void)__wt_atomic_subv32(&btree->evict_busy, 1); + if (app_timer) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV(session, + application_evict_time, WT_TIMEDIFF_US(leave, enter)); + } return (ret); } @@ -1767,6 +1782,7 @@ __evict_page(WT_SESSION_IMPL *session, bool is_server) int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) { + struct timespec enter, leave; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; @@ -1792,9 +1808,11 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) /* Wake the eviction server if we need to do work. */ __wt_evict_server_wake(session); - init_evict_count = cache->pages_evict; + /* Track how long application threads spend doing eviction. */ + if (WT_STAT_ENABLED(session) && !F_ISSET(session, WT_SESSION_INTERNAL)) + __wt_epoch(session, &enter); - for (;;) { + for (init_evict_count = cache->pages_evict;; ret = 0) { /* * A pathological case: if we're the oldest transaction in the * system and the eviction server is stuck trying to find space, @@ -1804,7 +1822,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) if (__wt_cache_stuck(session) && __wt_txn_am_oldest(session)) { --cache->evict_aggressive_score; WT_STAT_CONN_INCR(session, txn_fail_cache); - return (WT_ROLLBACK); + WT_ERR(WT_ROLLBACK); } /* @@ -1816,7 +1834,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) * limit the work to 5 evictions and return. If that's not the * case, we can do more. */ - if (!busy && txn_state->snap_min != WT_TXN_NONE && + if (!busy && txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) busy = true; max_pages_evicted = busy ? 
5 : 20; @@ -1825,7 +1843,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) if (!__wt_eviction_needed(session, busy, &pct_full) || (pct_full < 100 && cache->pages_evict > init_evict_count + max_pages_evicted)) - return (0); + break; /* * Don't make application threads participate in scrubbing for @@ -1842,7 +1860,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) switch (ret = __evict_page(session, false)) { case 0: if (busy) - return (0); + goto err; /* FALLTHROUGH */ case EBUSY: break; @@ -1853,9 +1871,18 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) cache->app_waits++; break; default: - return (ret); + goto err; } } + +err: if (WT_STAT_ENABLED(session) && + !F_ISSET(session, WT_SESSION_INTERNAL)) { + __wt_epoch(session, &leave); + WT_STAT_CONN_INCRV(session, + application_cache_time, WT_TIMEDIFF_US(leave, enter)); + } + + return (ret); /* NOTREACHED */ } diff --git a/src/include/api.h b/src/include/api.h index e1b2f8edaf3..2783d17f825 100644 --- a/src/include/api.h +++ b/src/include/api.h @@ -139,7 +139,9 @@ (s) = (WT_SESSION_IMPL *)(cur)->session; \ TXN_API_CALL_NOCONF(s, WT_CURSOR, n, cur, \ ((bt) == NULL) ? NULL : ((WT_BTREE *)(bt))->dhandle); \ - if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && __wt_cache_full(s)) \ + if (F_ISSET(S2C(s), WT_CONN_IN_MEMORY) && \ + !F_ISSET((WT_BTREE *)(bt), WT_BTREE_IGNORE_CACHE) && \ + __wt_cache_full(s)) \ WT_ERR(WT_CACHE_FULL); #define JOINABLE_CURSOR_UPDATE_API_CALL(cur, s, n, bt) \ diff --git a/src/include/btree.h b/src/include/btree.h index cfaf59e70e1..713d46ae85f 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -119,7 +119,7 @@ struct __wt_btree { uint64_t last_recno; /* Column-store last record number */ WT_REF root; /* Root page reference */ - int modified; /* If the tree ever modified */ + bool modified; /* If the tree ever modified */ bool bulk_load_ok; /* Bulk-load is a possibility */ WT_BM *bm; /* Block manager reference */ @@ -154,18 +154,19 @@ struct __wt_btree { WT_SPINLOCK flush_lock; /* Lock to flush the tree's pages */ /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ -#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ -#define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ -#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */ -#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ -#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ -#define WT_BTREE_NO_RECONCILE 0x04000 /* Allow splits, even with no evict */ -#define WT_BTREE_REBALANCE 0x08000 /* Handle is for rebalance */ -#define WT_BTREE_SALVAGE 0x10000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x20000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x40000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x80000 /* Handle is for verify */ +#define WT_BTREE_BULK 0x000100 /* Bulk-load handle */ +#define WT_BTREE_IGNORE_CACHE 0x000200 /* Cache-resident object */ +#define WT_BTREE_IN_MEMORY 0x000400 /* Cache-resident object */ +#define WT_BTREE_LOOKASIDE 0x000800 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x001000 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x002000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x004000 /* Disable logging */ +#define WT_BTREE_NO_RECONCILE 0x008000 /* Allow splits, even with no evict */ +#define WT_BTREE_REBALANCE 0x010000 /* Handle is for rebalance */ +#define WT_BTREE_SALVAGE 
0x020000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x040000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x080000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x100000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/btree.i b/src/include/btree.i index a9ce4f754a9..74ebf74f1e9 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -522,14 +522,22 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * might result in an extra checkpoint that doesn't do any work but it * shouldn't cause problems; regardless, let's play it safe.) */ - if (S2BT(session)->modified == 0) { + if (!S2BT(session)->modified) { /* Assert we never dirty a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); - S2BT(session)->modified = 1; + S2BT(session)->modified = true; WT_FULL_BARRIER(); } + /* + * There is a possibility of btree being dirty whereas connection being + * clean when entering this function. So make sure to update connection + * to dirty outside a condition on btree modified flag. + */ + if (!S2C(session)->modified) + S2C(session)->modified = true; + __wt_page_only_modify_set(session, page); } diff --git a/src/include/cache.i b/src/include/cache.i index 4255d04ec37..17ab39e97d2 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -355,7 +355,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, bool busy, bool *didworkp) txn_state = WT_SESSION_TXN_STATE(session); busy = busy || txn_state->id != WT_TXN_NONE || session->nhazard > 0 || - (txn_state->snap_min != WT_TXN_NONE && + (txn_state->pinned_id != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); /* diff --git a/src/include/connection.h b/src/include/connection.h index e19ad684b24..ce81dcf5976 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -352,6 +352,12 @@ struct __wt_connection_impl { WT_SESSION_IMPL *meta_ckpt_session;/* Metadata checkpoint session */ + /* + * Is there a data/schema change that needs to be the part of a + * checkpoint. + */ + bool modified; + WT_SESSION_IMPL *sweep_session; /* Handle sweep session */ wt_thread_t sweep_tid; /* Handle sweep thread */ int sweep_tid_set; /* Handle sweep thread set */ diff --git a/src/include/extern.h b/src/include/extern.h index 5444b2e9f14..e3cffa4ca3c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -677,7 +677,7 @@ extern uint32_t __wt_log2_int(uint32_t n); extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); -extern int __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); extern int __wt_buf_grow_worker(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_buf_fmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...) 
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 3, 4))) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/include/extern_posix.h b/src/include/extern_posix.h index d2f74d2ffe4..fd94ef0ddf2 100644 --- a/src/include/extern_posix.h +++ b/src/include/extern_posix.h @@ -27,5 +27,5 @@ extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_thread_id(char *buf, size_t buflen); -extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); extern void __wt_yield(void); diff --git a/src/include/extern_win.h b/src/include/extern_win.h index 8c2b19056e0..f06ee881ece 100644 --- a/src/include/extern_win.h +++ b/src/include/extern_win.h @@ -25,7 +25,7 @@ extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_thread_id(char *buf, size_t buflen); -extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp); extern int __wt_to_utf16_string( WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_to_utf8_string( WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern DWORD __wt_getlasterror(void); diff --git a/src/include/misc.i b/src/include/misc.i index f267c7afc91..befd480e085 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -33,16 +33,14 @@ __wt_strdup(WT_SESSION_IMPL *session, const char *str, void *retp) * __wt_seconds -- * Return the seconds since the Epoch. 
*/ -static inline int +static inline void __wt_seconds(WT_SESSION_IMPL *session, time_t *timep) { struct timespec t; - WT_RET(__wt_epoch(session, &t)); + __wt_epoch(session, &t); *timep = t.tv_sec; - - return (0); } /* diff --git a/src/include/stat.h b/src/include/stat.h index cd0cae16826..68879206851 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -420,6 +420,8 @@ struct __wt_connection_stats { int64_t thread_fsync_active; int64_t thread_read_active; int64_t thread_write_active; + int64_t application_evict_time; + int64_t application_cache_time; int64_t page_busy_blocked; int64_t page_forcible_evict_blocked; int64_t page_locked_blocked; @@ -437,6 +439,7 @@ struct __wt_connection_stats { int64_t txn_checkpoint_scrub_time; int64_t txn_checkpoint_time_total; int64_t txn_checkpoint; + int64_t txn_checkpoint_skipped; int64_t txn_fail_cache; int64_t txn_checkpoint_fsync_post; int64_t txn_checkpoint_fsync_post_duration; diff --git a/src/include/txn.h b/src/include/txn.h index 2e41ae8620d..8128e8e4cc2 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -49,9 +49,9 @@ WT_ASSERT((s), (s)->txn.forced_iso > 0); \ (s)->txn.forced_iso--; \ WT_ASSERT((s), txn_state->id == saved_state.id && \ - (txn_state->snap_min == saved_state.snap_min || \ - saved_state.snap_min == WT_TXN_NONE)); \ - txn_state->snap_min = saved_state.snap_min; \ + (txn_state->pinned_id == saved_state.pinned_id || \ + saved_state.pinned_id == WT_TXN_NONE)); \ + txn_state->pinned_id = saved_state.pinned_id; \ } while (0) struct __wt_named_snapshot { @@ -59,14 +59,14 @@ struct __wt_named_snapshot { TAILQ_ENTRY(__wt_named_snapshot) q; - uint64_t snap_min, snap_max; + uint64_t pinned_id, snap_min, snap_max; uint64_t *snapshot; uint32_t snapshot_count; }; struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_txn_state { volatile uint64_t id; - volatile uint64_t snap_min; + volatile uint64_t pinned_id; }; struct __wt_txn_global { diff --git a/src/include/txn.i b/src/include/txn.i index 1a8851a9a2a..cf7e2eafc65 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -309,7 +309,7 @@ __wt_txn_idle_cache_check(WT_SESSION_IMPL *session) * WT_TXN_HAS_SNAPSHOT. */ if (F_ISSET(txn, WT_TXN_RUNNING) && - !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->snap_min == WT_TXN_NONE) + !F_ISSET(txn, WT_TXN_HAS_ID) && txn_state->pinned_id == WT_TXN_NONE) WT_RET(__wt_cache_eviction_check(session, false, NULL)); return (0); @@ -480,8 +480,8 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session) * positioned on a value, it can't be freed. */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { - if (txn_state->snap_min == WT_TXN_NONE) - txn_state->snap_min = txn_global->last_running; + if (txn_state->pinned_id == WT_TXN_NONE) + txn_state->pinned_id = txn_global->last_running; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) WT_RET(__wt_txn_get_snapshot(session)); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f4763a113f1..2b71a580532 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1069,6 +1069,11 @@ struct __wt_session { * Permitted values are \c "none"\, \c "english"\, \c "utf8<file>" or \c * "utf16<file>". See @ref huffman for more information., a string; * default \c none.} + * @config{ignore_in_memory_cache_size, allow update and insert + * operations to proceed even if the cache is already at capacity. Only + * valid in conjunction with in-memory databases. 
Should be used with + * caution - this configuration allows WiredTiger to consume memory over + * the configured cache limit., a boolean flag; default \c false.} * @config{immutable, configure the index to be immutable - that is an * index is not changed by any update to a record in the table., a * boolean flag; default \c false.} @@ -1815,14 +1820,13 @@ struct __wt_connection { * default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads * when the cache contains at least this much dirty content\, expressed - * as a percentage of the total cache size. Ignored if \c in_memory is - * \c true., an integer between 1 and 99; default \c 5.} + * as a percentage of the total cache size., an integer between 1 and + * 99; default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to * perform eviction when the cache contains at least this much dirty * content\, expressed as a percentage of the total cache size. This - * setting only alters behavior if it is lower than eviction_trigger. - * Ignored if \c in_memory is \c true., an integer between 1 and 99; - * default \c 20.} + * setting only alters behavior if it is lower than eviction_trigger., + * an integer between 1 and 99; default \c 20.} * @config{eviction_target, perform eviction in worker threads when the * cache contains at least this much content\, expressed as a percentage * of the total cache size. Must be less than \c eviction_trigger., an @@ -2281,13 +2285,12 @@ struct __wt_connection { * is \c true., an integer between 0 and 99; default \c 5.} * @config{eviction_dirty_target, perform eviction in worker threads when the * cache contains at least this much dirty content\, expressed as a percentage - * of the total cache size. Ignored if \c in_memory is \c true., an integer - * between 1 and 99; default \c 5.} + * of the total cache size., an integer between 1 and 99; default \c 5.} * @config{eviction_dirty_trigger, trigger application threads to perform * eviction when the cache contains at least this much dirty content\, expressed * as a percentage of the total cache size. This setting only alters behavior - * if it is lower than eviction_trigger. Ignored if \c in_memory is \c true., - * an integer between 1 and 99; default \c 20.} + * if it is lower than eviction_trigger., an integer between 1 and 99; default + * \c 20.} * @config{eviction_target, perform eviction in worker threads when the cache * contains at least this much content\, expressed as a percentage of the total * cache size. Must be less than \c eviction_trigger., an integer between 10 @@ -4591,67 +4594,76 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_THREAD_READ_ACTIVE 1186 /*! thread-state: active filesystem write calls */ #define WT_STAT_CONN_THREAD_WRITE_ACTIVE 1187 +/*! thread-yield: application thread time evicting (usecs) */ +#define WT_STAT_CONN_APPLICATION_EVICT_TIME 1188 +/*! thread-yield: application thread time waiting for cache (usecs) */ +#define WT_STAT_CONN_APPLICATION_CACHE_TIME 1189 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1188 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1190 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1189 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1191 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1190 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1192 /*! 
thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1191 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1193 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1192 +#define WT_STAT_CONN_PAGE_SLEEP 1194 /*! transaction: number of named snapshots created */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1193 +#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1195 /*! transaction: number of named snapshots dropped */ -#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1194 +#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1196 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1195 +#define WT_STAT_CONN_TXN_BEGIN 1197 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1196 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1198 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1197 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1199 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1198 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1200 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1199 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1201 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1200 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1202 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1201 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1203 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1202 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1204 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1203 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1205 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1204 +#define WT_STAT_CONN_TXN_CHECKPOINT 1206 +/*! + * transaction: transaction checkpoints skipped because database was + * clean + */ +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1207 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1205 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1208 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1206 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1209 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1207 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1210 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1208 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1211 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1209 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1212 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1210 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1213 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1211 +#define WT_STAT_CONN_TXN_SYNC 1214 /*! 
transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1212 +#define WT_STAT_CONN_TXN_COMMIT 1215 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1213 +#define WT_STAT_CONN_TXN_ROLLBACK 1216 /*! * @} diff --git a/src/log/log.c b/src/log/log.c index b0c789f0f9e..00e4ea5f441 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -128,9 +128,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_dir_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync_dir); @@ -152,9 +152,9 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log_fh->name, min_lsn->l.file, min_lsn->l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_lsn = *min_lsn; WT_STAT_CONN_INCR(session, log_sync); @@ -1478,9 +1478,9 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); log->sync_dir_lsn = sync_lsn; @@ -1500,9 +1500,9 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset); WT_STAT_CONN_INCR(session, log_sync); - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__wt_fsync(session, log->log_fh, true)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); WT_STAT_CONN_INCRV(session, diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index e98f59e7b05..b9a6dd18b7a 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -165,8 +165,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; - uint64_t *switch_txnp; - uint64_t snap_min; + uint64_t pinned_id, *switchp; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; @@ -226,8 +225,8 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) * that overlaps with our snapshot is a potential * conflict. * - * Note that the global snap_min is correct here: it - * tracks concurrent transactions excluding special + * Note that the pinned ID is correct here: it tracks + * concurrent transactions excluding special * transactions such as checkpoint (which we can't * conflict with because checkpoint only writes the * metadata, which is not an LSM tree). 
@@ -237,17 +236,17 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); - snap_min = - WT_SESSION_TXN_STATE(session)->snap_min; - for (switch_txnp = + pinned_id = + WT_SESSION_TXN_STATE(session)->pinned_id; + for (switchp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; - clsm->nupdates++, switch_txnp--) { - if (WT_TXNID_LT(*switch_txnp, snap_min)) + clsm->nupdates++, switchp--) { + if (WT_TXNID_LT(*switchp, pinned_id)) break; WT_ASSERT(session, !__wt_txn_visible_all( - session, *switch_txnp)); + session, *switchp)); } } } diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index 5a5140b9c3a..0a5f4fdd8b5 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -392,7 +392,7 @@ __lsm_manager_run_server(WT_SESSION_IMPL *session) TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q) { if (!lsm_tree->active) continue; - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); pushms = lsm_tree->work_push_ts.tv_sec == 0 ? 0 : WT_TIMEDIFF_MS(now, lsm_tree->work_push_ts); fillms = 3 * lsm_tree->chunk_fill_ms; @@ -651,7 +651,7 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, return (0); } - WT_RET(__wt_epoch(session, &lsm_tree->work_push_ts)); + __wt_epoch(session, &lsm_tree->work_push_ts); WT_RET(__wt_calloc_one(session, &entry)); entry->type = type; entry->flags = flags; diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index 4bbfcfd4411..493855d489a 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -54,7 +54,7 @@ __lsm_merge_aggressive_clear(WT_LSM_TREE *lsm_tree) * __lsm_merge_aggressive_update -- * Update the merge aggressiveness for an LSM tree. */ -static int +static void __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { struct timespec now; @@ -72,7 +72,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) if (!lsm_tree->modified || F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) { lsm_tree->merge_aggressiveness = 10; - return (0); + return; } /* @@ -81,7 +81,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ if (lsm_tree->chunks_flushed <= lsm_tree->merge_min) { __lsm_merge_aggressive_clear(lsm_tree); - return (0); + return; } /* @@ -91,10 +91,10 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) */ if (!F_ISSET(lsm_tree, WT_LSM_TREE_AGGRESSIVE_TIMER)) { F_SET(lsm_tree, WT_LSM_TREE_AGGRESSIVE_TIMER); - return (__wt_epoch(session, &lsm_tree->merge_aggressive_ts)); + __wt_epoch(session, &lsm_tree->merge_aggressive_ts); } - WT_RET(__wt_epoch(session, &now)); + __wt_epoch(session, &now); msec_since_last_merge = WT_TIMEDIFF_MS(now, lsm_tree->merge_aggressive_ts); @@ -113,7 +113,7 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * generates a variable load. 
*/ if (msec_since_last_merge < msec_to_create_merge) - return (0); + return; /* * Bump how aggressively we look for merges based on how long since @@ -134,7 +134,6 @@ __lsm_merge_aggressive_update(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) msec_since_last_merge, lsm_tree->chunk_fill_ms); lsm_tree->merge_aggressiveness = new_aggressive; } - return (0); } /* @@ -326,7 +325,7 @@ retry_find: goto retry_find; } /* Consider getting aggressive if no merge was found */ - WT_RET(__lsm_merge_aggressive_update(session, lsm_tree)); + __lsm_merge_aggressive_update(session, lsm_tree); return (WT_NOTFOUND); } diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index db9fd581110..0054dcd1583 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -265,7 +265,7 @@ __wt_lsm_tree_setup_chunk( WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - WT_RET(__wt_epoch(session, &chunk->create_ts)); + __wt_epoch(session, &chunk->create_ts); WT_RET(__wt_lsm_tree_chunk_name( session, lsm_tree, chunk->id, &chunk->uri)); @@ -496,7 +496,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ - WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); + __wt_epoch(session, &lsm_tree->last_flush_ts); /* Now the tree is setup, make it visible to others. */ TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q); @@ -1139,7 +1139,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) return (0); } - WT_ERR(__wt_seconds(session, &begin)); + __wt_seconds(session, &begin); /* * Compacting has two distinct phases. @@ -1267,7 +1267,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, bool *skipp) break; } __wt_sleep(1, 0); - WT_ERR(__wt_seconds(session, &end)); + __wt_seconds(session, &end); if (session->compact->max_time > 0 && session->compact->max_time < (uint64_t)(end - begin)) { WT_ERR(ETIMEDOUT); diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index 72bcf56b3c4..917104031fc 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -358,7 +358,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ - WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); + __wt_epoch(session, &lsm_tree->last_flush_ts); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ diff --git a/src/meta/meta_ckpt.c b/src/meta/meta_ckpt.c index 2b7719c3241..b985104c2eb 100644 --- a/src/meta/meta_ckpt.c +++ b/src/meta/meta_ckpt.c @@ -424,7 +424,7 @@ __wt_meta_ckptlist_set(WT_SESSION_IMPL *session, * guaranteed, a time_t has to be an arithmetic type, * but not an integral type. 
*/ - WT_ERR(__wt_seconds(session, &secs)); + __wt_seconds(session, &secs); ckpt->sec = (uintmax_t)secs; } if (strcmp(ckpt->name, WT_CHECKPOINT) == 0) diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index b25bb8c25d1..842bb6eeec9 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -63,7 +63,7 @@ __wt_cond_wait_signal( locked = true; if (usecs > 0) { - WT_ERR(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); ts.tv_sec += (time_t) (((uint64_t)ts.tv_nsec + WT_THOUSAND * usecs) / WT_BILLION); ts.tv_nsec = (long) diff --git a/src/os_posix/os_time.c b/src/os_posix/os_time.c index b1b22a8e684..c7ae881af97 100644 --- a/src/os_posix/os_time.c +++ b/src/os_posix/os_time.c @@ -12,26 +12,26 @@ * __wt_epoch -- * Return the time since the Epoch. */ -int +void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) { WT_DECL_RET; #if defined(HAVE_CLOCK_GETTIME) - WT_SYSCALL(clock_gettime(CLOCK_REALTIME, tsp), ret); + WT_SYSCALL_RETRY(clock_gettime(CLOCK_REALTIME, tsp), ret); if (ret == 0) - return (0); - WT_RET_MSG(session, ret, "clock_gettime"); + return; + WT_PANIC_MSG(session, ret, "clock_gettime"); #elif defined(HAVE_GETTIMEOFDAY) struct timeval v; - WT_SYSCALL(gettimeofday(&v, NULL), ret); + WT_SYSCALL_RETRY(gettimeofday(&v, NULL), ret); if (ret == 0) { tsp->tv_sec = v.tv_sec; tsp->tv_nsec = v.tv_usec * WT_THOUSAND; - return (0); + return; } - WT_RET_MSG(session, ret, "gettimeofday"); + WT_PANIC_MSG(session, ret, "gettimeofday"); #else NO TIME-OF-DAY IMPLEMENTATION: see src/os_posix/os_time.c #endif diff --git a/src/os_win/os_time.c b/src/os_win/os_time.c index e784b5d8a36..6aa5b3719f6 100644 --- a/src/os_win/os_time.c +++ b/src/os_win/os_time.c @@ -12,11 +12,11 @@ * __wt_epoch -- * Return the time since the Epoch. */ -int +void __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) { - uint64_t ns100; FILETIME time; + uint64_t ns100; WT_UNUSED(session); @@ -26,8 +26,6 @@ __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp) - 116444736000000000LL; tsp->tv_sec = ns100 / 10000000; tsp->tv_nsec = (long)((ns100 % 10000000) * 100); - - return (0); } /* diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 9c38c535301..810f3fd976b 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -451,19 +451,18 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* - * When application threads perform eviction, don't cache block manager - * or reconciliation structures (even across calls), we can have a - * significant number of application threads doing eviction at the same - * time with large items. We ignore checkpoints, once the checkpoint - * completes, all unnecessary session resources will be discarded. + * When threads perform eviction, don't cache block manager or + * reconciliation structures (even across calls), we can have a + * significant number of threads doing eviction at the same time with + * large items. We ignore checkpoints, once the checkpoint completes, + * all unnecessary session resources will be discarded. * - * Even in application threads doing checkpoints or in internal threads - * doing any reconciliation, clean up reconciliation resources. Some - * workloads have millions of boundary structures in a reconciliation - * and we don't want to tie that memory down, even across calls. + * Even in application threads doing checkpoints, clean up + * reconciliation resources. 
Some workloads have millions of boundary + * structures in a reconciliation and we don't want to tie that memory + * down, even across calls. */ - if (WT_SESSION_IS_CHECKPOINT(session) || - F_ISSET(session, WT_SESSION_INTERNAL)) + if (WT_SESSION_IS_CHECKPOINT(session)) __rec_bnd_cleanup(session, r, false); else { /* @@ -564,10 +563,12 @@ __rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * barrier after the change for clarity (the requirement is the * flag be set before a subsequent checkpoint reads it, and * as the current checkpoint is waiting on this reconciliation - * to complete, there's no risk of that happening) + * to complete, there's no risk of that happening). */ - btree->modified = 1; + btree->modified = true; WT_FULL_BARRIER(); + if (!S2C(session)->modified) + S2C(session)->modified = true; /* * Eviction should only be here if following the save/restore @@ -3335,7 +3336,7 @@ supd_check_complete: __wt_verbose(session, WT_VERB_SPLIT, "Reconciliation creating a page with %" PRIu32 " entries, memory footprint %" WT_SIZET_FMT - ", page count %" PRIu32 ", %s, split state: %d\n", + ", page count %" PRIu32 ", %s, split state: %d", r->entries, r->page->memory_footprint, r->bnd_next, F_ISSET(r, WT_EVICTING) ? "evict" : "checkpoint", r->bnd_state); diff --git a/src/session/session_api.c b/src/session/session_api.c index 0d3fcad3184..d3432c19ef3 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -68,9 +68,10 @@ __wt_session_copy_values(WT_SESSION_IMPL *session) * unless the cursor is reading from a checkpoint. */ WT_TXN_STATE *txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min != WT_TXN_NONE || - (WT_PREFIX_MATCH(cursor->uri, "file:") && - F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); + WT_ASSERT(session, + txn_state->pinned_id != WT_TXN_NONE || + (WT_PREFIX_MATCH(cursor->uri, "file:") && + F_ISSET((WT_CURSOR_BTREE *)cursor, WT_CBT_NO_TXN))); #endif F_CLR(cursor, WT_CURSTD_VALUE_INT); @@ -1417,10 +1418,10 @@ __session_transaction_pinned_range(WT_SESSION *wt_session, uint64_t *prange) /* Assign pinned to the lesser of id or snap_min */ if (txn_state->id != WT_TXN_NONE && - WT_TXNID_LT(txn_state->id, txn_state->snap_min)) + WT_TXNID_LT(txn_state->id, txn_state->pinned_id)) pinned = txn_state->id; else - pinned = txn_state->snap_min; + pinned = txn_state->pinned_id; if (pinned == WT_TXN_NONE) *prange = 0; @@ -1494,14 +1495,14 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) if (timeout_ms == 0) WT_ERR(ETIMEDOUT); - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); /* * Keep checking the LSNs until we find it is stable or we reach * our timeout. */ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { __wt_cond_signal(session, conn->log_file_cond); - WT_ERR(__wt_epoch(session, &now)); + __wt_epoch(session, &now); waited_ms = WT_TIMEDIFF_MS(now, start); if (forever || waited_ms < timeout_ms) /* diff --git a/src/session/session_compact.c b/src/session/session_compact.c index f03d5d34bac..66635007723 100644 --- a/src/session/session_compact.c +++ b/src/session/session_compact.c @@ -179,17 +179,16 @@ __compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[]) * Check if the timeout has been exceeded. 
*/ static int -__session_compact_check_timeout( - WT_SESSION_IMPL *session, struct timespec begin) +__session_compact_check_timeout(WT_SESSION_IMPL *session, struct timespec begin) { struct timespec end; if (session->compact->max_time == 0) return (0); - WT_RET(__wt_epoch(session, &end)); + __wt_epoch(session, &end); if (session->compact->max_time < WT_TIMEDIFF_SEC(end, begin)) - WT_RET(ETIMEDOUT); + return (ETIMEDOUT); return (0); } @@ -219,7 +218,7 @@ __compact_file(WT_SESSION_IMPL *session, const char *cfg[]) session, t, "target=(\"%s\"),force=1", dhandle->name)); checkpoint_cfg[1] = t->data; - WT_ERR(__wt_epoch(session, &start_time)); + __wt_epoch(session, &start_time); /* * We compact 10% of the file on each pass (but the overall size of the diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index e76407567bc..725854c6001 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -8,8 +8,6 @@ #include "wt_internal.h" -static int __session_dhandle_sweep(WT_SESSION_IMPL *); - /* * __session_add_dhandle -- * Add a handle to the session's cache. @@ -371,7 +369,7 @@ __wt_session_close_cache(WT_SESSION_IMPL *session) * __session_dhandle_sweep -- * Discard any session dhandles that are not open. */ -static int +static void __session_dhandle_sweep(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; @@ -385,9 +383,9 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) * Periodically sweep for dead handles; if we've swept recently, don't * do it again. */ - WT_RET(__wt_seconds(session, &now)); + __wt_seconds(session, &now); if (difftime(now, session->last_sweep) < conn->sweep_interval) - return (0); + return; session->last_sweep = now; WT_STAT_CONN_INCR(session, dh_session_sweeps); @@ -408,7 +406,6 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) } dhandle_cache = dhandle_cache_next; } - return (0); } /* @@ -446,7 +443,7 @@ __session_get_dhandle( } /* Sweep the handle list to remove any dead handles. */ - WT_RET(__session_dhandle_sweep(session)); + __session_dhandle_sweep(session); /* * We didn't find a match in the session cache, search the shared diff --git a/src/support/err.c b/src/support/err.c index 8bfac250b3a..3ecbab1cbe9 100644 --- a/src/support/err.c +++ b/src/support/err.c @@ -162,7 +162,6 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, WT_SESSION *wt_session; struct timespec ts; size_t len, remain, wlen; - int prefix_cnt; const char *err, *prefix; char *end, *p, tid[128]; @@ -211,44 +210,32 @@ __wt_eventv(WT_SESSION_IMPL *session, bool msg_event, int error, * name, and the session's name. Write them as a comma-separate list, * followed by a colon. */ - prefix_cnt = 0; - if (__wt_epoch(session, &ts) == 0) { - __wt_thread_id(tid, sizeof(tid)); - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "[%" PRIuMAX ":%" PRIuMAX "][%s]", - (uintmax_t)ts.tv_sec, - (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); - p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; - } + __wt_epoch(session, &ts); + __wt_thread_id(tid, sizeof(tid)); + remain = WT_PTRDIFF(end, p); + wlen = (size_t)snprintf(p, remain, "[%" PRIuMAX ":%" PRIuMAX "][%s]", + (uintmax_t)ts.tv_sec, (uintmax_t)ts.tv_nsec / WT_THOUSAND, tid); + p = wlen >= remain ? end : p + wlen; + if ((prefix = S2C(session)->error_prefix) != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? 
end : p + wlen; - prefix_cnt = 1; } prefix = session->dhandle == NULL ? NULL : session->dhandle->name; if (prefix != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; } if ((prefix = session->name) != NULL) { remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, - "%s%s", prefix_cnt == 0 ? "" : ", ", prefix); - p = wlen >= remain ? end : p + wlen; - prefix_cnt = 1; - } - if (prefix_cnt != 0) { - remain = WT_PTRDIFF(end, p); - wlen = (size_t)snprintf(p, remain, ": "); + wlen = (size_t)snprintf(p, remain, ", %s", prefix); p = wlen >= remain ? end : p + wlen; } + remain = WT_PTRDIFF(end, p); + wlen = (size_t)snprintf(p, remain, ": "); + p = wlen >= remain ? end : p + wlen; if (file_name != NULL) { remain = WT_PTRDIFF(end, p); diff --git a/src/support/rand.c b/src/support/rand.c index d2e4cd27aab..025b18e4ed3 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -66,20 +66,18 @@ __wt_random_init(WT_RAND_STATE volatile * rnd_state) * threads and we want each thread to initialize its own random state based * on a different random seed. */ -int +void __wt_random_init_seed( WT_SESSION_IMPL *session, WT_RAND_STATE volatile * rnd_state) { struct timespec ts; WT_RAND_STATE rnd; - WT_RET(__wt_epoch(session, &ts)); + __wt_epoch(session, &ts); M_W(rnd) = (uint32_t)(ts.tv_nsec + 521288629); M_Z(rnd) = (uint32_t)(ts.tv_nsec + 362436069); *rnd_state = rnd; - - return (0); } /* diff --git a/src/support/stat.c b/src/support/stat.c index 7150223e6cb..9d440f9ebf3 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -706,6 +706,8 @@ static const char * const __stats_connection_desc[] = { "thread-state: active filesystem fsync calls", "thread-state: active filesystem read calls", "thread-state: active filesystem write calls", + "thread-yield: application thread time evicting (usecs)", + "thread-yield: application thread time waiting for cache (usecs)", "thread-yield: page acquire busy blocked", "thread-yield: page acquire eviction blocked", "thread-yield: page acquire locked blocked", @@ -723,6 +725,7 @@ static const char * const __stats_connection_desc[] = { "transaction: transaction checkpoint scrub time (msecs)", "transaction: transaction checkpoint total time (msecs)", "transaction: transaction checkpoints", + "transaction: transaction checkpoints skipped because database was clean", "transaction: transaction failures due to cache overflow", "transaction: transaction fsync calls for checkpoint after allocating the transaction ID", "transaction: transaction fsync duration for checkpoint after allocating the transaction ID (usecs)", @@ -950,6 +953,8 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing thread_fsync_active */ /* not clearing thread_read_active */ /* not clearing thread_write_active */ + stats->application_evict_time = 0; + stats->application_cache_time = 0; stats->page_busy_blocked = 0; stats->page_forcible_evict_blocked = 0; stats->page_locked_blocked = 0; @@ -967,6 +972,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) /* not clearing txn_checkpoint_scrub_time */ /* not clearing txn_checkpoint_time_total */ stats->txn_checkpoint = 0; + stats->txn_checkpoint_skipped = 0; stats->txn_fail_cache = 0; stats->txn_checkpoint_fsync_post = 0; /* not clearing txn_checkpoint_fsync_post_duration */ @@ -1242,6 +1248,10 @@ __wt_stat_connection_aggregate( 
to->thread_fsync_active += WT_STAT_READ(from, thread_fsync_active); to->thread_read_active += WT_STAT_READ(from, thread_read_active); to->thread_write_active += WT_STAT_READ(from, thread_write_active); + to->application_evict_time += + WT_STAT_READ(from, application_evict_time); + to->application_cache_time += + WT_STAT_READ(from, application_cache_time); to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked); to->page_forcible_evict_blocked += WT_STAT_READ(from, page_forcible_evict_blocked); @@ -1270,6 +1280,8 @@ __wt_stat_connection_aggregate( to->txn_checkpoint_time_total += WT_STAT_READ(from, txn_checkpoint_time_total); to->txn_checkpoint += WT_STAT_READ(from, txn_checkpoint); + to->txn_checkpoint_skipped += + WT_STAT_READ(from, txn_checkpoint_skipped); to->txn_fail_cache += WT_STAT_READ(from, txn_fail_cache); to->txn_checkpoint_fsync_post += WT_STAT_READ(from, txn_checkpoint_fsync_post); diff --git a/src/support/thread_group.c b/src/support/thread_group.c index f5ddabad7d4..a866d2d01c5 100644 --- a/src/support/thread_group.c +++ b/src/support/thread_group.c @@ -60,7 +60,7 @@ __thread_group_grow( while (group->current_threads < new_count) { thread = group->threads[group->current_threads++]; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Starting utility thread: %p:%"PRIu32"\n", + "Starting utility thread: %p:%" PRIu32, (void *)group, thread->id); F_SET(thread, WT_THREAD_RUN); WT_ASSERT(session, thread->session != NULL); @@ -100,7 +100,7 @@ __thread_group_shrink(WT_SESSION_IMPL *session, /* Wake threads to ensure they notice the state change */ if (thread->tid != 0) { __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Stopping utility thread: %p:%"PRIu32"\n", + "Stopping utility thread: %p:%" PRIu32, (void *)group, thread->id); F_CLR(thread, WT_THREAD_RUN); __wt_cond_signal(session, group->wait_cond); @@ -224,7 +224,7 @@ __wt_thread_group_resize( __wt_verbose(session, WT_VERB_THREAD_GROUP, "Resize thread group: %p, from min: %" PRIu32 " -> %" PRIu32 - " from max: %" PRIu32 " -> %" PRIu32 "\n", + " from max: %" PRIu32 " -> %" PRIu32, (void *)group, group->min, new_min, group->max, new_max); __wt_writelock(session, group->lock); @@ -253,7 +253,7 @@ __wt_thread_group_create( cond_alloced = false; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Creating thread group: %p\n", (void *)group); + "Creating thread group: %p", (void *)group); WT_RET(__wt_rwlock_alloc(session, &group->lock, "Thread group")); WT_ERR(__wt_cond_alloc( @@ -286,7 +286,7 @@ __wt_thread_group_destroy(WT_SESSION_IMPL *session, WT_THREAD_GROUP *group) WT_DECL_RET; __wt_verbose(session, WT_VERB_THREAD_GROUP, - "Destroying thread group: %p\n", (void *)group); + "Destroying thread group: %p", (void *)group); WT_ASSERT(session, __wt_rwlock_islocked(session, group->lock)); diff --git a/src/txn/txn.c b/src/txn/txn.c index 01e0fbbb634..3b24bcd505d 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -96,11 +96,11 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session) txn_state = WT_SESSION_TXN_STATE(session); WT_ASSERT(session, - txn_state->snap_min == WT_TXN_NONE || + txn_state->pinned_id == WT_TXN_NONE || session->txn.isolation == WT_ISO_READ_UNCOMMITTED || - !__wt_txn_visible_all(session, txn_state->snap_min)); + !__wt_txn_visible_all(session, txn_state->pinned_id)); - txn_state->snap_min = WT_TXN_NONE; + txn_state->pinned_id = WT_TXN_NONE; F_CLR(txn, WT_TXN_HAS_SNAPSHOT); } @@ -117,7 +117,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s, *txn_state; uint64_t current_id, id; - 
uint64_t prev_oldest_id, snap_min; + uint64_t prev_oldest_id, pinned_id; uint32_t i, n, session_cnt; conn = S2C(session); @@ -135,21 +135,21 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_PAUSE(); WT_RET(ret); - current_id = snap_min = txn_global->current; + current_id = pinned_id = txn_global->current; prev_oldest_id = txn_global->oldest_id; /* * Include the checkpoint transaction, if one is running: we should * ignore any uncommitted changes the checkpoint has written to the * metadata. We don't have to keep the checkpoint's changes pinned so - * don't including it in the published snap_min. + * don't including it in the published pinned ID. */ if ((id = txn_global->checkpoint_txnid) != WT_TXN_NONE) txn->snapshot[n++] = id; /* For pure read-only workloads, avoid scanning. */ if (prev_oldest_id == current_id) { - txn_state->snap_min = current_id; + txn_state->pinned_id = current_id; /* Check that the oldest ID has not moved in the meantime. */ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); goto done; @@ -172,18 +172,18 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) (id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id)) { txn->snapshot[n++] = id; - if (WT_TXNID_LT(id, snap_min)) - snap_min = id; + if (WT_TXNID_LT(id, pinned_id)) + pinned_id = id; } } /* - * If we got a new snapshot, update the published snap_min for this + * If we got a new snapshot, update the published pinned ID for this * session. */ - WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, snap_min)); + WT_ASSERT(session, WT_TXNID_LE(prev_oldest_id, pinned_id)); WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); - txn_state->snap_min = snap_min; + txn_state->pinned_id = pinned_id; done: __wt_readunlock(session, txn_global->scan_rwlock); __txn_sort_snapshot(session, n, current_id); @@ -232,13 +232,13 @@ __txn_oldest_scan(WT_SESSION_IMPL *session, /* * !!! - * Note: Don't ignore snap_min values older than the previous - * oldest ID. Read-uncommitted operations publish snap_min + * Note: Don't ignore pinned ID values older than the previous + * oldest ID. Read-uncommitted operations publish pinned ID * values without acquiring the scan lock to protect the global - * table. See the comment in __wt_txn_cursor_op for - * more details. + * table. See the comment in __wt_txn_cursor_op for more + * details. 
*/ - if ((id = s->snap_min) != WT_TXN_NONE && + if ((id = s->pinned_id) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) { oldest_id = id; oldest_session = &conn->sessions[i]; @@ -360,7 +360,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, uint32_t flags) __wt_verbose(session, WT_VERB_TRANSACTION, "old snapshot %" PRIu64 " pinned in session %" PRIu32 " [%s]" - " with snap_min %" PRIu64 "\n", + " with snap_min %" PRIu64, oldest_id, oldest_session->id, oldest_session->lastop, oldest_session->txn.snap_min); @@ -673,7 +673,7 @@ __wt_txn_init(WT_SESSION_IMPL *session) if (S2C(session)->txn_global.states != NULL) { WT_TXN_STATE *txn_state; txn_state = WT_SESSION_TXN_STATE(session); - WT_ASSERT(session, txn_state->snap_min == WT_TXN_NONE); + WT_ASSERT(session, txn_state->pinned_id == WT_TXN_NONE); } #endif @@ -773,7 +773,7 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states); for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) - s->id = s->snap_min = WT_TXN_NONE; + s->id = s->pinned_id = WT_TXN_NONE; return (0); } diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 3aad95f5a9f..0557e6ce60c 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -314,7 +314,7 @@ __checkpoint_update_generation(WT_SESSION_IMPL *session) * __checkpoint_reduce_dirty_cache -- * Release clean trees from the list cached for checkpoints. */ -static int +static void __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) { WT_CACHE *cache; @@ -332,9 +332,9 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) /* Give up if scrubbing is disabled. */ if (cache->eviction_checkpoint_target == 0 || cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger) - return (0); + return; - WT_RET(__wt_epoch(session, &start)); + __wt_epoch(session, &start); last = start; bytes_written_last = 0; bytes_written_start = cache->bytes_written; @@ -345,7 +345,7 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) * cache via reconfigure. This avoids potential divide by zero. */ if (cache_size < 10 * WT_MEGABYTE) - return (0); + return; stepdown_us = 10000; work_us = 0; progress = false; @@ -371,7 +371,7 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) break; __wt_sleep(0, stepdown_us / 10); - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); total_ms = WT_TIMEDIFF_MS(stop, start); bytes_written_total = @@ -427,14 +427,12 @@ __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) WT_MAX(cache->eviction_dirty_target, current_dirty - delta); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, cache->eviction_scrub_limit); - WT_RET(__wt_epoch(session, &last)); + __wt_epoch(session, &last); } - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); total_ms = WT_TIMEDIFF_MS(stop, start); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); - - return (0); } /* @@ -497,7 +495,7 @@ __checkpoint_stats( * __checkpoint_verbose_track -- * Output a verbose message with timing information */ -static int +static void __checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg, struct timespec *start) { @@ -506,9 +504,9 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, uint64_t msec; if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) - return (0); + return; - WT_RET(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); /* * Get time diff in microseconds. 
@@ -526,7 +524,6 @@ __checkpoint_verbose_track(WT_SESSION_IMPL *session, WT_UNUSED(msg); WT_UNUSED(start); #endif - return (0); } /* @@ -576,7 +573,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) conn->cache->evict_max_page_size = 0; /* Initialize the verbose tracking timer */ - WT_ERR(__wt_epoch(session, &verb_timer)); + __wt_epoch(session, &verb_timer); /* * Update the global oldest ID so we do all possible cleanup. @@ -594,18 +591,18 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Try to reduce the amount of dirty data in cache so there is less * work do during the critical section of the checkpoint. */ - WT_ERR(__checkpoint_reduce_dirty_cache(session)); + __checkpoint_reduce_dirty_cache(session); /* Tell logging that we are about to start a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); - WT_ERR(__checkpoint_verbose_track(session, - "starting transaction", &verb_timer)); + __checkpoint_verbose_track(session, + "starting transaction", &verb_timer); if (full) - WT_ERR(__wt_epoch(session, &start)); + __wt_epoch(session, &start); /* * Start the checkpoint for real. @@ -666,6 +663,14 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_txn_id_check(session)); /* + * Mark the connection as clean. If some data gets modified after + * generating checkpoint transaction id, connection will be reset to + * dirty when reconciliation marks the btree dirty on encountering the + * dirty page. + */ + conn->modified = false; + + /* * Save the checkpoint session ID. * * We never do checkpoints in the default session (with id zero). @@ -689,7 +694,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && - WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); + WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); /* * Clear our entry from the global transaction session table. Any @@ -698,7 +703,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * can safely ignore the checkpoint ID (see the visible all check for * details). */ - txn_state->id = txn_state->snap_min = WT_TXN_NONE; + txn_state->id = txn_state->pinned_id = WT_TXN_NONE; __wt_writeunlock(session, txn_global->scan_rwlock); /* @@ -739,23 +744,22 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync)); __wt_evict_server_wake(session); - WT_ERR(__checkpoint_verbose_track(session, - "committing transaction", &verb_timer)); + __checkpoint_verbose_track(session, + "committing transaction", &verb_timer); /* * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). 
*/ - WT_ERR(__wt_epoch(session, &fsync_start)); + __wt_epoch(session, &fsync_start); WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - WT_ERR(__wt_epoch(session, &fsync_stop)); + __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs); - WT_ERR(__checkpoint_verbose_track(session, - "sync completed", &verb_timer)); + __checkpoint_verbose_track(session, "sync completed", &verb_timer); /* * Commit the transaction now that we are sure that all files in the @@ -793,8 +797,8 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); - WT_ERR(__checkpoint_verbose_track(session, - "metadata sync completed", &verb_timer)); + __checkpoint_verbose_track(session, + "metadata sync completed", &verb_timer); } else WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), @@ -808,7 +812,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) txn_global->checkpoint_pinned = WT_TXN_NONE; if (full) { - WT_ERR(__wt_epoch(session, &stop)); + __wt_epoch(session, &stop); __checkpoint_stats(session, &start, &stop); } @@ -825,6 +829,9 @@ err: /* * overwritten the checkpoint, so what ends up on disk is not * consistent. */ + if (ret != 0 && !conn->modified) + conn->modified = true; + session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) WT_TRET(__wt_meta_track_off(session, false, ret != 0)); @@ -1352,9 +1359,13 @@ __checkpoint_tree( * out of sync with the set of dirty pages (modify is set, but there * are no dirty pages), we perform a checkpoint without any writes, no * checkpoint is created, and then things get bad. + * While marking the root page as dirty, we do not want to dirty the + * btree because we are marking the btree as clean just after this call. + * Also, marking the btree dirty at this stage will unnecessarily mark + * the connection as dirty causing checkpoint-skip code to fail. */ WT_ERR(__wt_page_modify_init(session, btree->root.page)); - __wt_page_modify_set(session, btree->root.page); + __wt_page_only_modify_set(session, btree->root.page); /* * Clear the tree's modified flag; any changes before we clear the flag @@ -1366,7 +1377,7 @@ __checkpoint_tree( * it sets the modified flag itself. Use a full barrier so we get the * store done quickly, this isn't a performance path. */ - btree->modified = 0; + btree->modified = false; WT_FULL_BARRIER(); /* Tell logging that a file checkpoint is starting. */ @@ -1440,8 +1451,11 @@ err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. 
*/ - if (ret != 0 && !btree->modified && was_modified) - btree->modified = 1; + if (ret != 0 && !btree->modified && was_modified) { + btree->modified = true; + if (!S2C(session)->modified) + S2C(session)->modified = true; + } __wt_meta_ckptlist_free(session, ckptbase); btree->ckpt = NULL; diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index 8f7e93238de..7ba0cc8700e 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -42,9 +42,16 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) return (WT_NOTFOUND); /* Bump the global ID if we are removing the first entry */ - if (found == TAILQ_FIRST(&txn_global->nsnaph)) + if (found == TAILQ_FIRST(&txn_global->nsnaph)) { + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ? - TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; + TAILQ_NEXT(found, q)->pinned_id : WT_TXN_NONE; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE || + !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id)); + } TAILQ_REMOVE(&txn_global->nsnaph, found, q); __nsnap_destroy(session, found); WT_STAT_CONN_INCR(session, txn_snapshots_dropped); @@ -104,7 +111,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } if (TAILQ_NEXT(last, q) != NULL) - new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min; + new_nsnap_oldest = TAILQ_NEXT(last, q)->pinned_id; } do { @@ -117,7 +124,15 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, bool inclusive) } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); /* Now that the queue of named snapshots is updated, update the ID */ + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_global->nsnap_oldest_id) && + (new_nsnap_oldest == WT_TXN_NONE || + WT_TXNID_LE(txn_global->nsnap_oldest_id, new_nsnap_oldest))); txn_global->nsnap_oldest_id = new_nsnap_oldest; + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, + new_nsnap_oldest == WT_TXN_NONE || + !__wt_txn_visible_all(session, new_nsnap_oldest)); return (ret); } @@ -157,6 +172,7 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_ERR(__wt_calloc_one(session, &nsnap_new)); nsnap = nsnap_new; WT_ERR(__wt_strndup(session, cval.str, cval.len, &nsnap->name)); + nsnap->pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; nsnap->snap_min = txn->snap_min; nsnap->snap_max = txn->snap_max; if (txn->snapshot_count > 0) { @@ -175,15 +191,25 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); - if (TAILQ_EMPTY(&txn_global->nsnaph)) - txn_global->nsnap_oldest_id = nsnap_new->snap_min; + if (TAILQ_EMPTY(&txn_global->nsnaph)) { + WT_ASSERT(session, txn_global->nsnap_oldest_id == WT_TXN_NONE && + !__wt_txn_visible_all(session, nsnap_new->pinned_id)); + __wt_readlock(session, txn_global->scan_rwlock); + txn_global->nsnap_oldest_id = nsnap_new->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + } TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); WT_STAT_CONN_INCR(session, txn_snapshots_created); nsnap_new = NULL; -err: if (started_txn) +err: if (started_txn) { +#ifdef HAVE_DIAGNOSTIC + uint64_t pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id; +#endif WT_TRET(__wt_txn_rollback(session, NULL)); - else if (ret == 0) + WT_DIAGNOSTIC_YIELD; + WT_ASSERT(session, !__wt_txn_visible_all(session, pinned_id)); + } else if (ret == 0) F_SET(txn, WT_TXN_NAMED_SNAPSHOT); if 
(nsnap_new != NULL) @@ -258,7 +284,20 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) __wt_readlock(session, txn_global->nsnap_rwlock); TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { - txn->snap_min = txn_state->snap_min = nsnap->snap_min; + /* + * Acquire the scan lock so the oldest ID can't move + * forward without seeing our pinned ID. + */ + __wt_readlock(session, txn_global->scan_rwlock); + txn_state->pinned_id = nsnap->pinned_id; + __wt_readunlock(session, txn_global->scan_rwlock); + + WT_ASSERT(session, !__wt_txn_visible_all( + session, txn_state->pinned_id) && + txn_global->nsnap_oldest_id != WT_TXN_NONE && + WT_TXNID_LE(txn_global->nsnap_oldest_id, + txn_state->pinned_id)); + txn->snap_min = nsnap->snap_min; txn->snap_max = nsnap->snap_max; if ((txn->snapshot_count = nsnap->snapshot_count) != 0) memcpy(txn->snapshot, nsnap->snapshot, |