209 files changed, 8245 insertions, 5277 deletions
diff --git a/SConstruct b/SConstruct
index 49e4417133f..70ed6e0220b 100644
--- a/SConstruct
+++ b/SConstruct
@@ -454,6 +454,7 @@ t = env.Program("wtperf", [
         "bench/wtperf/misc.c",
         "bench/wtperf/track.c",
         "bench/wtperf/wtperf.c",
+        "bench/wtperf/wtperf_truncate.c",
     ], LIBS=[wtlib, shim] + wtlibs)
 Default(t)
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index 4445de3296d..6b0ce47ef3f 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -96,7 +96,7 @@ config_assign(CONFIG *dest, const CONFIG *src)
         }
     }
 
-    STAILQ_INIT(&dest->stone_head);
+    TAILQ_INIT(&dest->stone_head);
     return (0);
 }
@@ -257,13 +257,15 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
             continue;
         }
         if (STRING_MATCH("truncate_pct", k.str, k.len)) {
-            if ((workp->truncate_pct = v.val) <= 0)
+            if (v.val <= 0)
                 goto err;
+            workp->truncate_pct = (uint64_t)v.val;
             continue;
         }
         if (STRING_MATCH("truncate_count", k.str, k.len)) {
-            if ((workp->truncate_count = v.val) <= 0)
+            if (v.val <= 0)
                 goto err;
+            workp->truncate_count = (uint64_t)v.val;
             continue;
         }
         goto err;
diff --git a/bench/wtperf/runners/mongodb-large-oplog.wtperf b/bench/wtperf/runners/mongodb-large-oplog.wtperf
new file mode 100644
index 00000000000..1e203a34cc3
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-large-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates a test database of 7.8GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=300000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=8192
+# Setup three threads to insert into the oplog
+# Setup one thread to be doing truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=300000))
diff --git a/bench/wtperf/runners/mongodb-small-oplog.wtperf b/bench/wtperf/runners/mongodb-small-oplog.wtperf
new file mode 100644
index 00000000000..4f2ae5359cd
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-small-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates an oplog of 6.1GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=750000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=512
+# Setup three threads to insert into the oplog
+# Setup one thread to be doing truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=750000))
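A note on the runner files above: given the setup logic added in bench/wtperf/wtperf_truncate.c later in this diff, truncate_count=750000 with truncate_pct=10 yields a stone gap of (750000 * 10) / 100 = 75,000 keys and 750000 / 75000 = 10 milestones, so the truncate thread removes the oldest tenth of the configured count at a time. These files are normally handed to wtperf via its option-file flag, something like ./wtperf -O runners/mongodb-small-oplog.wtperf; the exact path depends on where the benchmark is built.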
diff --git a/bench/wtperf/runners/wtperf_run.sh b/bench/wtperf/runners/wtperf_run.sh
index d5de7c4abdb..ac31c2a2e78 100755
--- a/bench/wtperf/runners/wtperf_run.sh
+++ b/bench/wtperf/runners/wtperf_run.sh
@@ -24,18 +24,18 @@ outfile=./wtperf.out
 rm -f $outfile
 
 # Each of these has an entry for each op in ops below.
-avg=(0 0 0)
-max=(0 0 0)
-min=(0 0 0)
-sum=(0 0 0)
+avg=(0 0 0 0)
+max=(0 0 0 0)
+min=(0 0 0 0)
+sum=(0 0 0 0)
 # Load needs floating point and bc, handle separately.
-loadindex=4
+loadindex=5
 avg[$loadindex]=0
 max[$loadindex]=0
 min[$loadindex]=0
 sum[$loadindex]=0
-ops=(read insert update)
-outp=("Read count:" "Insert count:" "Update count:")
+ops=(read insert update truncate)
+outp=("Read count:" "Insert count:" "Update count:" "Truncate count:")
 outp[$loadindex]="Load time:"
 
 # getval min/max val cur
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 148aa0e4e84..5d3b334785d 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -96,17 +96,11 @@ static uint64_t wtperf_value_range(CONFIG *);
 #define HELIUM_CONFIG ",type=helium"
 #define INDEX_COL_NAMES ",columns=(key,val)"
 
-inline uint64_t
-decode_key(char *key_buf)
-{
-    return (strtoull(key_buf, NULL, 10));
-}
-
 /* Retrieve an ID for the next insert operation. */
 static inline uint64_t
 get_next_incr(CONFIG *cfg)
 {
-    return (WT_ATOMIC_ADD8(cfg->insert_key, 1));
+    return (__wt_atomic_add64(&cfg->insert_key, 1));
 }
 
 static void
@@ -157,7 +151,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
     switch (type) {
     case WT_AOP_COMPACT:
         tables = (uint32_t *)op->app_private;
-        WT_ATOMIC_ADD4(*tables, (uint32_t)-1);
+        (void)__wt_atomic_add32(tables, (uint32_t)-1);
         break;
     case WT_AOP_INSERT:
         trk = &thread->insert;
@@ -192,7 +186,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
         return (0);
     if (ret == 0 || (ret == WT_NOTFOUND && type != WT_AOP_INSERT)) {
         if (!cfg->in_warmup)
-            (void)WT_ATOMIC_ADD8(trk->ops, 1);
+            (void)__wt_atomic_add64(&trk->ops, 1);
         return (0);
     }
 err:
@@ -513,10 +507,9 @@ worker(void *arg)
          * is 0, to avoid first time latency spikes.
          */
         measure_latency =
-            cfg->sample_interval != 0 && trk->ops != 0 && (
-            trk->ops % cfg->sample_rate == 0);
-        if (measure_latency &&
-            (ret = __wt_epoch(NULL, &start)) != 0) {
+            cfg->sample_interval != 0 && trk != NULL &&
+            trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+        if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
             lprintf(cfg, ret, 0, "Get time call failed");
             goto err;
         }
@@ -880,10 +873,9 @@ populate_thread(void *arg)
         cursor = cursors[op % cfg->table_count];
         generate_key(cfg, key_buf, op);
         measure_latency =
-            cfg->sample_interval != 0 && trk->ops != 0 && (
-            trk->ops % cfg->sample_rate == 0);
-        if (measure_latency &&
-            (ret = __wt_epoch(NULL, &start)) != 0) {
+            cfg->sample_interval != 0 &&
+            trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+        if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
             lprintf(cfg, ret, 0, "Get time call failed");
             goto err;
         }
@@ -1001,10 +993,9 @@ populate_async(void *arg)
          * the time to process by workers.
          */
         measure_latency =
-            cfg->sample_interval != 0 && trk->ops != 0 && (
-            trk->ops % cfg->sample_rate == 0);
-        if (measure_latency &&
-            (ret = __wt_epoch(NULL, &start)) != 0) {
+            cfg->sample_interval != 0 &&
+            trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+        if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
             lprintf(cfg, ret, 0, "Get time call failed");
             goto err;
         }
@@ -1046,8 +1037,7 @@ populate_async(void *arg)
             goto err;
         if (measure_latency) {
             if ((ret = __wt_epoch(NULL, &stop)) != 0) {
-                lprintf(cfg, ret, 0,
-                    "Get time call failed");
+                lprintf(cfg, ret, 0, "Get time call failed");
                 goto err;
             }
             ++trk->latency_ops;
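This file also switches every WT_ATOMIC_ADD4/ADD8 macro to the typed __wt_atomic_add32/64 functions that take a pointer to the target. As a rough, hedged sketch of what such a typed helper looks like (WiredTiger generates its own per-type versions; this stand-in assumes the GCC/Clang __atomic builtins):

    #include <stdint.h>

    /*
     * Illustrative stand-in for an add-and-fetch helper in the style of
     * __wt_atomic_add64; not WiredTiger's actual implementation.
     */
    static inline uint64_t
    atomic_add64(uint64_t *vp, uint64_t v)
    {
        /* Atomically add v to *vp and return the new value. */
        return (__atomic_add_fetch(vp, v, __ATOMIC_SEQ_CST));
    }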
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index 991c09138e3..e4b9fc00798 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -95,7 +95,7 @@ typedef struct {
     int64_t ops_per_txn;
     int64_t truncate;                /* Truncate ratio */
     uint64_t truncate_pct;           /* Truncate Percent */
-    uint64_t truncate_count;         /* Truncate Percent */
+    uint64_t truncate_count;         /* Truncate Count */
 
 #define WORKER_INSERT       1        /* Insert */
 #define WORKER_INSERT_RMW   2        /* Insert with read-modify-write */
@@ -108,7 +108,6 @@ typedef struct {
 /* Steering items for the truncate workload */
 typedef struct __truncate_struct TRUNCATE_CONFIG;
 struct __truncate_struct {
-    double truncation_percentage;
     uint64_t stone_gap;
     uint64_t needed_stones;
     uint64_t final_stone_gap;
@@ -122,8 +121,8 @@ struct __truncate_struct {
 /* Queue entry for use with the Truncate Logic */
 struct __truncate_queue_entry {
     char *key;                       /* Truncation point */
-    u_int diff;                      /* Number of items to be truncated*/
-    STAILQ_ENTRY(__truncate_queue_entry) q;
+    uint64_t diff;                   /* Number of items to be truncated*/
+    TAILQ_ENTRY(__truncate_queue_entry) q;
 };
 typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY;
@@ -179,7 +178,7 @@ struct __config {                    /* Configuration structure */
     u_int has_truncate;              /* if there is a truncate workload */
 
     /* Queue head for use with the Truncate Logic */
-    STAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
+    TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
 
     /* Fields changeable on command line are listed in wtperf_opt.i */
 #define OPT_DECLARE_STRUCT
@@ -273,7 +272,6 @@ int config_opt_line(CONFIG *, const char *);
 int      config_opt_str(CONFIG *, const char *, const char *);
 void     config_print(CONFIG *);
 int      config_sanity(CONFIG *);
-uint64_t decode_key(char *);
 void     latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
 void     latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
 void     latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
diff --git a/bench/wtperf/wtperf_truncate.c b/bench/wtperf/wtperf_truncate.c
index 0cdbbb914a4..581d1987947 100644
--- a/bench/wtperf/wtperf_truncate.c
+++ b/bench/wtperf/wtperf_truncate.c
@@ -28,6 +28,12 @@
 
 #include "wtperf.h"
 
+static inline uint64_t
+decode_key(char *key_buf)
+{
+    return (strtoull(key_buf, NULL, 10));
+}
+
 int
 setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
 
@@ -37,8 +43,7 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
     WT_CURSOR *cursor;
     char *key, *truncate_key;
     int ret;
-    size_t i;
-    uint64_t end_point, final_stone_gap, start_point;
+    uint64_t end_point, final_stone_gap, i, start_point;
 
     end_point = final_stone_gap = start_point = 0;
     trunc_cfg = &thread->trunc_cfg;
@@ -49,11 +54,9 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
         session, cfg->uris[0], NULL, NULL, &cursor)) != 0)
         goto err;
 
-    /* Truncation percentage value. eg 10% is 0.1. */
-    trunc_cfg->truncation_percentage = (double)workload->truncate_pct / 100;
     /* How many entries between each stone. */
     trunc_cfg->stone_gap =
-        workload->truncate_count * trunc_cfg->truncation_percentage;
+        (workload->truncate_count * workload->truncate_pct) / 100;
     /* How many stones we need. */
     trunc_cfg->needed_stones =
         workload->truncate_count / trunc_cfg->stone_gap;
@@ -94,8 +97,13 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
         trunc_cfg->expected_total = (end_point - start_point);
         for (i = 1; i <= trunc_cfg->needed_stones; i++) {
             truncate_key = calloc(cfg->key_sz, 1);
+            if (truncate_key == NULL) {
+                ret = enomem(cfg);
+                goto err;
+            }
             truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
             if (truncate_item == NULL) {
+                free(truncate_key);
                 ret = enomem(cfg);
                 goto err;
             }
@@ -104,14 +112,16 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
             truncate_item->key = truncate_key;
             truncate_item->diff =
                 (trunc_cfg->stone_gap * i) - trunc_cfg->last_key;
-            STAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
+            TAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
             trunc_cfg->last_key = trunc_cfg->stone_gap * i;
             trunc_cfg->num_stones++;
         }
     }
     trunc_cfg->stone_gap = final_stone_gap;
 
-err:    cursor->close(cursor);
+err:    if ((ret = cursor->close(cursor)) != 0) {
+        lprintf(cfg, ret, 0, "truncate setup: cursor close failed");
+    }
     return (ret);
 }
 
@@ -141,16 +151,22 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
     while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
         trunc_cfg->last_key += trunc_cfg->stone_gap;
         truncate_key = calloc(cfg->key_sz, 1);
+        if (truncate_key == NULL) {
+            lprintf(cfg, ENOMEM, 0,
+                "truncate: couldn't allocate key array");
+            return (ENOMEM);
+        }
         truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
         if (truncate_item == NULL) {
+            free(truncate_key);
             lprintf(cfg, ENOMEM, 0,
-                "worker: couldn't allocate cursor array");
+                "truncate: couldn't allocate item");
             return (ENOMEM);
         }
         generate_key(cfg, truncate_key, trunc_cfg->last_key);
         truncate_item->key = truncate_key;
         truncate_item->diff = trunc_cfg->stone_gap;
-        STAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
+        TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
         trunc_cfg->num_stones++;
     }
@@ -159,9 +175,9 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
         trunc_cfg->expected_total <= thread->workload->truncate_count)
         return (0);
 
-    truncate_item = STAILQ_FIRST(&cfg->stone_head);
+    truncate_item = TAILQ_FIRST(&cfg->stone_head);
     trunc_cfg->num_stones--;
-    STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+    TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
     cursor->set_key(cursor,truncate_item->key);
     if ((ret = cursor->search(cursor)) != 0) {
         lprintf(cfg, ret, 0, "Truncate search: failed");
@@ -179,7 +195,6 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
 err:    free(truncate_item->key);
     free(truncate_item);
-    truncate_item = NULL;
     t_ret = cursor->reset(cursor);
     if (t_ret != 0)
         lprintf(cfg, t_ret, 0, "Cursor reset failed");
@@ -192,9 +207,9 @@ void
 cleanup_truncate_config(CONFIG *cfg)
 {
     TRUNCATE_QUEUE_ENTRY *truncate_item;
 
-    while (!STAILQ_EMPTY(&cfg->stone_head)) {
-        truncate_item = STAILQ_FIRST(&cfg->stone_head);
-        STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+    while (!TAILQ_EMPTY(&cfg->stone_head)) {
+        truncate_item = TAILQ_FIRST(&cfg->stone_head);
+        TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
         free(truncate_item->key);
         free(truncate_item);
     }
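The STAILQ-to-TAILQ switch above (and in the async code below) matches a new s_style rule later in this diff requiring TAILQ for all lists. For readers unfamiliar with <sys/queue.h>, here is a minimal, self-contained sketch of the stone-queue pattern; the entry layout and values are invented for illustration:

    #include <sys/queue.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct stone {
        uint64_t diff;               /* Items covered by this milestone */
        TAILQ_ENTRY(stone) q;        /* List linkage */
    };
    TAILQ_HEAD(stone_qh, stone);

    int
    main(void)
    {
        struct stone_qh head;
        struct stone *s;
        int i;

        TAILQ_INIT(&head);
        for (i = 1; i <= 3; ++i) {           /* Enqueue at the tail. */
            if ((s = calloc(1, sizeof(*s))) == NULL)
                return (1);
            s->diff = (uint64_t)i * 75000;
            TAILQ_INSERT_TAIL(&head, s, q);
        }
        while (!TAILQ_EMPTY(&head)) {        /* Drain from the head. */
            s = TAILQ_FIRST(&head);
            TAILQ_REMOVE(&head, s, q);
            printf("stone covers %" PRIu64 " items\n", s->diff);
            free(s);
        }
        return (0);
    }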
diff --git a/build_win/filelist.win b/build_win/filelist.win
index 099451e418d..9d0ee10d305 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -45,6 +45,7 @@ src/btree/col_srch.c
 src/btree/row_key.c
 src/btree/row_modify.c
 src/btree/row_srch.c
+src/cache/cache_las.c
 src/config/config.c
 src/config/config_api.c
 src/config/config_check.c
diff --git a/dist/api_data.py b/dist/api_data.py
index 43b585a6c6d..3a700cf886b 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -449,13 +449,17 @@ connection_runtime_config = [
     Config('chunk', '10MB', r'''
         the granularity that a shared cache is redistributed''',
         min='1MB', max='10TB'),
+    Config('name', 'none', r'''
+        the name of a cache that is shared between databases or
+        \c "none" when no shared cache is configured'''),
+    Config('quota', '0', r'''
+        maximum size of cache this database can be allocated from the
+        shared cache. Defaults to the entire shared cache size''',
+        type='int'),
     Config('reserve', '0', r'''
         amount of cache this database is guaranteed to have available
         from the shared cache. This setting is per database. Defaults
         to the chunk size''', type='int'),
-    Config('name', 'none', r'''
-        the name of a cache that is shared between databases or
-        \c "none" when no shared cache is configured'''),
     Config('size', '500MB', r'''
         maximum memory to allocate for the shared cache. Setting this
         will update the value if one is already set''',
@@ -981,8 +985,10 @@ methods = {
 connection_runtime_config +
 common_wiredtiger_open + [
     Config('config_base', 'true', r'''
-        write the base configuration file if creating the database,
-        see @ref config_base for more information''',
+        write the base configuration file if creating the database. If
+        \c false in the config passed directly to ::wiredtiger_open, will
+        ignore any existing base configuration file in addition to not creating
+        one. See @ref config_base for more information''',
         type='boolean'),
     Config('create', 'false', r'''
         create the database if it does not exist''',
@@ -1011,8 +1017,10 @@ methods = {
 connection_runtime_config +
 common_wiredtiger_open + [
     Config('config_base', 'true', r'''
-        write the base configuration file if creating the database,
-        see @ref config_base for more information''',
+        write the base configuration file if creating the database. If
+        \c false in the config passed directly to ::wiredtiger_open, will
+        ignore any existing base configuration file in addition to not creating
+        one. See @ref config_base for more information''',
         type='boolean'),
     Config('create', 'false', r'''
         create the database if it does not exist''',
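The reworded config_base documentation above is worth a concrete illustration; a minimal sketch, with the home directory name being illustrative:

    #include <wiredtiger.h>

    /*
     * Sketch: with config_base=false passed directly to wiredtiger_open,
     * no WiredTiger.basecfg is written and, per the new wording above,
     * any existing base configuration file is ignored rather than read.
     */
    int
    open_without_basecfg(WT_CONNECTION **connp)
    {
        return (wiredtiger_open(
            "WT_HOME", NULL, "create,config_base=false", connp));
    }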
diff --git a/dist/filelist b/dist/filelist
index c3321cf845d..f33f0e9a962 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -45,6 +45,7 @@ src/btree/col_srch.c
 src/btree/row_key.c
 src/btree/row_modify.c
 src/btree/row_srch.c
+src/cache/cache_las.c
 src/config/config.c
 src/config/config_api.c
 src/config/config_check.c
diff --git a/dist/flags.py b/dist/flags.py
index c8d9bcc6a5e..d98f249335e 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -12,7 +12,6 @@ flags = {
         'SYNC_CHECKPOINT',
         'SYNC_CLOSE',
         'SYNC_DISCARD',
-        'SYNC_DISCARD_FORCE',
         'SYNC_WRITE_LEAVES',
     ],
     'file_types' : [
@@ -46,9 +45,10 @@ flags = {
         'READ_WONT_NEED',
     ],
     'rec_write' : [
+        'EVICT_LOOKASIDE',
         'EVICTING',
-        'SKIP_UPDATE_ERR',
-        'SKIP_UPDATE_RESTORE',
+        'EVICT_UPDATE_RESTORE',
+        'VISIBILITY_ERR',
     ],
     'txn_log_checkpoint' : [
         'TXN_LOG_CKPT_CLEANUP',
@@ -107,19 +107,20 @@ flags = {
     'session' : [
         'SESSION_CAN_WAIT',
         'SESSION_CLEAR_EVICT_WALK',
-        'SESSION_DISCARD_FORCE',
+        'SESSION_INTERNAL',
         'SESSION_LOCKED_CHECKPOINT',
         'SESSION_LOCKED_HANDLE_LIST',
         'SESSION_LOCKED_SCHEMA',
+        'SESSION_LOCKED_SLOT',
         'SESSION_LOCKED_TABLE',
-        'SESSION_INTERNAL',
         'SESSION_LOGGING_INMEM',
+        'SESSION_LOOKASIDE_CURSOR',
         'SESSION_NO_CACHE',
-        'SESSION_NO_CACHE_CHECK',
         'SESSION_NO_DATA_HANDLES',
+        'SESSION_NO_EVICTION',
         'SESSION_NO_LOGGING',
         'SESSION_NO_SCHEMA_LOCK',
-        'SESSION_SALVAGE_CORRUPT_OK',
+        'SESSION_QUIET_CORRUPT_FILE',
         'SESSION_SERVER_ASYNC',
     ],
 }
diff --git a/dist/s_all b/dist/s_all
index c624db06a97..8e3f265e79b 100755
--- a/dist/s_all
+++ b/dist/s_all
@@ -2,7 +2,7 @@
 # Run standard scripts.
 t=__wt.$$
-t_pfx=__s_all_tmp
+t_pfx=__s_all_tmp_
 trap 'rm -f $t *.pyc __tmp __wt.* __s_all_tmp*' 0 1 2 3 13 15
 
 # We require python which may not be installed.
diff --git a/dist/s_define b/dist/s_define
index 7809bf14918..77673bdcdf9 100755
--- a/dist/s_define
+++ b/dist/s_define
@@ -4,18 +4,23 @@
 t=__wt.$$
 trap 'rm -f $t; exit 0' 0 1 2 3 13 15
 
-# List of files to search.
+# List of source files to search.
 l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
 l="$l `echo ../src/include/*.i ../src/utilities/*.c ../test/*/*.c`"
 
+# List of include files for source #defines.
+# Ignore the queue.h file, we don't use most of it.
+dl="../src/include/*.[hi] ../src/include/*.in"
+dl=`echo $dl | sed 's/ [^ ]*queue.h//'`
+
 (
 # Copy out the list of #defines we don't use, but it's OK.
 sed -e '/^$/d' -e '/^#/d' < s_define.list
 
-# Get the list of #defines.
-# Ignore the list of configuration objects
-# Ignore the list of statistic "keys" generated for applications.
-search=`cat ../src/include/*.[hi] ../src/include/*.in |
+# Search the list of include files for #defines
+# Ignore configuration objects #defines
+# Ignore statistic "keys" generated for applications #defines
+search=`cat $dl |
     sed -e '/configuration section: BEGIN/,/configuration section: END/d' \
         -e '/Statistics section: BEGIN/,/Statistics section: END/d' |
     egrep '^#define' |
diff --git a/dist/s_define.list b/dist/s_define.list
index 623a34447a8..aaf365a7376 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -16,115 +16,43 @@ TXN_API_CALL
 TXN_API_CALL_NOCONF
 TXN_API_END
 WIN32_LEAN_AND_MEAN
-WT_ATOMIC_ADD1
-WT_ATOMIC_ADD2
-WT_ATOMIC_CAS1
-WT_ATOMIC_CAS2
-WT_ATOMIC_FETCH_ADD1
-WT_ATOMIC_FETCH_ADD2
-WT_ATOMIC_FETCH_ADD4
-WT_ATOMIC_STORE1
-WT_ATOMIC_STORE2
-WT_ATOMIC_SUB1
-WT_ATOMIC_SUB2
+WT_ATOMIC_CAS
+WT_ATOMIC_FUNC
 WT_BARRIER
 WT_BLOCK_DESC_SIZE
 WT_CACHE_LINE_ALIGNMENT
 WT_COMPILER_TYPE_ALIGN
 WT_CONN_CHECK_PANIC
+WT_COUNTER_SLOTS
 WT_DEADLOCK
 WT_DEBUG_BYTE
 WT_HANDLE_CLOSED
 WT_HANDLE_NULLABLE
+WT_LOG_SLOT_ACTIVE
+WT_LOG_SLOT_BITS
+WT_LOG_SLOT_JOIN_MASK
+WT_LOG_SLOT_MASK_OFF
+WT_LOG_SLOT_MASK_ON
+WT_LOG_SLOT_MAXBITS
+WT_LOG_SLOT_UNBUFFERED_ISSET
 WT_PACKED_STRUCT_BEGIN
 WT_PACKED_STRUCT_END
 WT_READ_BARRIER
 WT_REF_SIZE
 WT_SESSION_LOCKED_CHECKPOINT
-WT_STAT_ATOMIC_DECR
-WT_STAT_ATOMIC_DECRV
-WT_STAT_ATOMIC_INCR
-WT_STAT_ATOMIC_INCRV
+WT_STATS_FIELD_TO_SLOT
+WT_STATS_SLOT_ID
 WT_STAT_DECR
 WT_STAT_DECRV
-WT_STAT_FAST_ATOMIC_DECR
-WT_STAT_FAST_ATOMIC_DECRV
-WT_STAT_FAST_ATOMIC_INCR
-WT_STAT_FAST_ATOMIC_INCRV
-WT_STAT_FAST_CONN_ATOMIC_DECRV
-WT_STAT_FAST_CONN_ATOMIC_INCRV
 WT_STAT_FAST_CONN_DECRV
 WT_STAT_FAST_DATA_DECRV
 WT_STAT_FAST_DECR
 WT_STAT_FAST_DECRV
+WT_STAT_FAST_INCR
 WT_STAT_FAST_INCRV
 WT_STAT_FAST_SET
+WT_STAT_WRITE
 WT_WITH_LOCK
 __F
 __WIREDTIGER_EXT_H_
 __WIREDTIGER_H_
-__WT_ATOMIC_ADD
-__WT_ATOMIC_CAS
-__WT_ATOMIC_FETCH_ADD
-__WT_ATOMIC_STORE
-__WT_ATOMIC_SUB
-
-# List of queue.h #defines that are "unused", but it's OK.
-LIST_EMPTY
-LIST_ENTRY
-LIST_FIRST
-LIST_FOREACH
-LIST_HEAD
-LIST_HEAD_INITIALIZER
-LIST_INIT
-LIST_INSERT_AFTER
-LIST_INSERT_BEFORE
-LIST_INSERT_HEAD
-LIST_NEXT
-LIST_REMOVE
-QMD_TRACE_ELEM
-QMD_TRACE_HEAD
-QUEUE_MACRO_DEBUG
-SLIST_EMPTY
-SLIST_ENTRY
-SLIST_FIRST
-SLIST_FOREACH
-SLIST_FOREACH_PREVPTR
-SLIST_HEAD
-SLIST_HEAD_INITIALIZER
-SLIST_INIT
-SLIST_INSERT_AFTER
-SLIST_INSERT_HEAD
-SLIST_NEXT
-SLIST_REMOVE
-SLIST_REMOVE_HEAD
-STAILQ_CONCAT
-STAILQ_EMPTY
-STAILQ_ENTRY
-STAILQ_FIRST
-STAILQ_FOREACH
-STAILQ_HEAD
-STAILQ_HEAD_INITIALIZER
-STAILQ_INIT
-STAILQ_INSERT_AFTER
-STAILQ_INSERT_HEAD
-STAILQ_INSERT_TAIL
-STAILQ_LAST
-STAILQ_NEXT
-STAILQ_REMOVE
-STAILQ_REMOVE_HEAD
-STAILQ_REMOVE_HEAD_UNTIL
-TAILQ_CONCAT
-TAILQ_EMPTY
-TAILQ_ENTRY
-TAILQ_FOREACH_REVERSE
-TAILQ_HEAD
-TAILQ_HEAD_INITIALIZER
-TAILQ_INSERT_AFTER
-TAILQ_INSERT_BEFORE
-TAILQ_LAST
-TAILQ_NEXT
-TAILQ_PREV
-TRACEBUF
-TRASHIT
-_DB_QUEUE_H_
diff --git a/dist/s_stat b/dist/s_stat
index 152097f14be..44c22ab56bb 100755
--- a/dist/s_stat
+++ b/dist/s_stat
@@ -16,7 +16,7 @@ l="$l `echo ../src/include/*.i`"
 (
 # Get the list of statistics fields.
 search=`sed \
-    -e 's/^ WT_STATS \([a-z_*]*\);$/\1/p' \
+    -e 's/^ int64_t \([a-z_*]*\);$/\1/p' \
     -e d ../src/include/stat.h |
     sort`
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 48c0f7f30f4..bfc4124f74d 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -32,6 +32,7 @@ BIGENDIAN
 BOOL
 BSR
 BTREE
+BUF
 BZ
 Barack
 Bitfield
@@ -156,11 +157,13 @@ KVS
 Kanowski's
 Kounavis
 LANGID
+LAS
 LF
 LLLLLL
 LLLLLLL
 LOGREC
 LOGSCAN
+LOOKASIDE
 LRU
 LRVv
 LSB
@@ -176,6 +179,7 @@ Levyx
 Llqr
 Llqrt
 LockFile
+Lookaside
 Lookup
 MALLOC
 MEM
@@ -210,6 +214,7 @@ NetBSD
 NoAddr
 Noll
 Nul
+OOB
 OPTYPE
 OUTBUFF
 OVFL
@@ -231,6 +236,7 @@ Preload
 Prepend
 Qsort
 RCS
+RECNO
 REF's
 REFs
 RET
@@ -291,6 +297,7 @@ ULINE
 URI
 URIs
 UTF
+Unbuffered
 UnixLib
 Unmap
 UnmapViewOfFile
@@ -320,6 +327,7 @@ WiredTiger's
 WiredTigerCheckpoint
 WiredTigerException
 WiredTigerInit
+WiredTigerLAS
 WiredTigerLog
 WiredTigerPreplog
 WiredTigerTmplog
@@ -396,6 +404,7 @@ bzalloc
 bzfree
 bzip
 calloc
+cas
 catfmt
 cd
 centric
@@ -494,6 +503,7 @@ desc
 dest
 destSize
 dev
+dh
 dhandle
 dhandles
 dir
@@ -503,6 +513,7 @@ dlh
 dll
 dlopen
 dlsym
+dmalloc
 dmsg
 doxgen
 doxygen
@@ -512,6 +523,7 @@ dsk
 dsrc
 dst
 dstlen
+dstrdup
 dsync
 dumpcmp
 dumpfile
@@ -648,6 +660,7 @@ kvraw
 kvs
 kvsbdb
 lang
+las
 latencies
 lbrace
 lbracket
@@ -675,6 +688,7 @@ logread
 logrec
 logsize
 logtest
+lookaside
 lookup
 lookups
 lossy
@@ -745,6 +759,7 @@ nop
 noraw
 notfound
 notsup
+notused
 nset
 nsnap
 nul
@@ -797,6 +812,7 @@ progname
 ps
 psp
 pthread
+ptr
 pushms
 putK
 putV
@@ -937,6 +953,7 @@ uS
 uint
 uintmax
 unbare
+unbuffered
 uncompressing
 uncompresssed
 undef
@@ -945,6 +962,7 @@ unesc
 unescaped
 uninstantiated
 unistd
+unlinked
 unmap
 unmarshall
 unmarshalled
diff --git a/dist/s_style b/dist/s_style
index e5411748a31..0e013852914 100755
--- a/dist/s_style
+++ b/dist/s_style
@@ -46,6 +46,11 @@ else
         cat $t
     fi
 
+    if ! expr "$f" : 'src/include/queue\.h' > /dev/null &&
+        egrep 'STAILQ_|SLIST_|\bLIST_' $f ; then
+        echo "$f: use TAILQ for all lists"
+    fi
+
     if ! expr "$f" : 'src/os_posix/.*' > /dev/null &&
         ! expr "$f" : 'src/os_win/.*' > /dev/null &&
         ! expr "$f" : 'src/include/extern.h' > /dev/null &&
@@ -69,6 +74,13 @@ else
         cat $t
     }
 
+    # Alignment directive before "struct".
+    egrep 'WT_COMPILER_TYPE_ALIGN.*struct' $f > $t
+    test -s $t && {
+        echo "$f: compiler alignment direction must precede \"struct\""
+        cat $t
+    }
+
     # Direct calls to functions we're not supposed to use in the library.
     # We don't check for all of them, just a few of the common ones.
     if ! expr "$f" : 'bench/.*' > /dev/null &&
diff --git a/dist/s_typedef b/dist/s_typedef
index 2e206757f48..233f432f0e5 100755
--- a/dist/s_typedef
+++ b/dist/s_typedef
@@ -25,7 +25,7 @@ build() {
     $l |
     sed -e 's/WT_PACKED_STRUCT_BEGIN(\(.*\))/struct \1 {/' \
         -e 's/WT_COMPILER_TYPE_ALIGN(.*)[ ]*//' \
-        -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \
+        -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort -u | \
     while read t n; do
         upper=`echo $n | sed -e 's/^__//' | tr [a-z] [A-Z]`
         echo "$t $n;"
diff --git a/dist/s_whitespace b/dist/s_whitespace
index 3a51b251bfe..dfc031e3ea4 100755
--- a/dist/s_whitespace
+++ b/dist/s_whitespace
@@ -4,7 +4,16 @@
 t=__wt.$$
 trap 'rm -f $t; exit 0' 0 1 2 3 13 15
 
-ws()
+# Clear lines that only contain whitespace.
+whitespace()
+{
+    sed -e 's/[ ][ ]*$//' < $1 > $t
+    cmp $t $1 > /dev/null 2>&1 || (echo "$1" && cp $t $1)
+}
+
+# Clear lines that only contain whitespace, compress multiple empty lines
+# into a single line, discarding trailing empty lines.
+whitespace_and_empty_line()
 {
     sed -e 's/[ ][ ]*$//' \
         -e '/^$/N' \
@@ -14,10 +23,12 @@ ws()
 
 cd ..
 
+# Scripts.
 for f in `find dist -name '*.py' -name 's_*'`; do
-    ws $f
+    whitespace_and_empty_line $f
 done
 
+# C-language sources.
 for f in `find examples ext src test \
     -name '*.[chi]' -o \
     -name '*.dox' -o \
@@ -26,5 +37,11 @@ for f in `find examples ext src test \
     if expr "$f" : ".*/Makefile.in" > /dev/null; then
         continue
     fi
-    ws $f
+    whitespace_and_empty_line $f
+done
+
+# Python sources.
+for f in `find test \
+    -name '*.py' | sed '/3rdparty/d'`; do
+    whitespace $f
 done
diff --git a/dist/stat.py b/dist/stat.py
index 2a87d4425e6..c9684665a53 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -12,12 +12,11 @@ def print_struct(title, name, base, stats):
     f.write('/*\n')
     f.write(' * Statistics entries for ' + title + '.\n')
     f.write(' */\n')
-    f.write(
-        '#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
+    f.write('#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
     f.write('struct __wt_' + name + '_stats {\n')
 
     for l in stats:
-        f.write('\tWT_STATS ' + l.name + ';\n')
+        f.write('\tint64_t ' + l.name + ';\n')
     f.write('};\n\n')
 
 # Update the #defines in the stat.h file.
@@ -90,67 +89,113 @@ for line in open('../src/include/wiredtiger.in', 'r'):
 f.close()
 compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
 
-def print_func(name, list):
-    '''Print the functions for the stat.c file.'''
+def print_func(name, handle, list):
+    '''Print the structures/functions for the stat.c file.'''
+    f.write('\n')
+    f.write('static const char * const __stats_' + name + '_desc[] = {\n')
+    for l in list:
+        f.write('\t"' + l.desc + '",\n')
+    f.write('};\n')
+
+    f.write('''
+const char *
+__wt_stat_''' + name + '''_desc(int slot)
+{
+\treturn (__stats_''' + name + '''_desc[slot]);
+}
+''')
+
     f.write('''
 void
-__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats)
+__wt_stat_''' + name + '_init_single(WT_' + name.upper() + '''_STATS *stats)
 {
-\t/* Clear, so can also be called for reinitialization. */
 \tmemset(stats, 0, sizeof(*stats));
-
-''')
-    for l in sorted(list):
-        o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n'
-        if len(o) + 7 > 80:
-            o = o.replace('= ', '=\n\t    ')
-        f.write(o)
-    f.write('''}
+}
 ''')
 
     f.write('''
 void
-__wt_stat_refresh_''' + name + '''_stats(void *stats_arg)
+__wt_stat_''' + name + '_init(' + handle + ''' *handle)
 {
-\tWT_''' + name.upper() + '''_STATS *stats;
+\tint i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+\t\thandle->stats[i] = &handle->stat_array[i];
+\t\t__wt_stat_''' + name + '''_init_single(handle->stats[i]);
+\t}
+}
+''')
 
-\tstats = (WT_''' + name.upper() + '''_STATS *)stats_arg;
+    f.write('''
+void
+__wt_stat_''' + name + '_clear_single(WT_' + name.upper() + '''_STATS *stats)
+{
 ''')
     for l in sorted(list):
         # no_clear: don't clear the value.
-        if not 'no_clear' in l.flags:
-            f.write('\tstats->' + l.name + '.v = 0;\n');
+        if 'no_clear' in l.flags:
+            f.write('\t\t/* not clearing ' + l.name + ' */\n')
+        else:
+            f.write('\tstats->' + l.name + ' = 0;\n')
     f.write('}\n')
 
-    # Aggregation is only interesting for data-source statistics.
-    # Complain if any aggregation flags are set.
-    if name == 'connection':
+    f.write('''
+void
+__wt_stat_''' + name + '_clear_all(WT_' + name.upper() + '''_STATS **stats)
+{
+\tu_int i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i)
+\t\t__wt_stat_''' + name + '''_clear_single(stats[i]);
+}
+''')
+
+    # Single structure aggregation is currently only used by data sources.
+    if name == 'dsrc':
+        f.write('''
+void
+__wt_stat_''' + name + '''_aggregate_single(
+    WT_''' + name.upper() + '_STATS *from, WT_' + name.upper() + '''_STATS *to)
+{
+''')
         for l in sorted(list):
-            if 'no_aggregate' in l.flags or 'max_aggregate' in l.flags:
-                print >>sys.stdout,\
-                    "Aggregation configuration for " +\
-                    name + "." + l.name + " statistics not supported"
-        return;
+            if 'no_aggregate' in l.flags:
+                o = '\tto->' + l.name + ' = from->' + l.name + ';\n'
+            elif 'max_aggregate' in l.flags:
+                o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\
+                    '\t\tto->' + l.name + ' = from->' + l.name + ';\n'
+            else:
+                o = '\tto->' + l.name + ' += from->' + l.name + ';\n'
+            if len(o) > 72:     # Account for the leading tab.
+                o = o.replace(' += ', ' +=\n\t    ')
+            f.write(o)
+        f.write('}\n')
 
     f.write('''
 void
-__wt_stat_aggregate_''' + name +
-'''_stats(const void *child, const void *parent)
+__wt_stat_''' + name + '''_aggregate(
+    WT_''' + name.upper() + '_STATS **from, WT_' + name.upper() + '''_STATS *to)
 {
-\tWT_''' + name.upper() + '''_STATS *c, *p;
-
-\tc = (WT_''' + name.upper() + '''_STATS *)child;
-\tp = (WT_''' + name.upper() + '''_STATS *)parent;
 ''')
+    # Connection level aggregation does not currently have any computation
+    # of a maximum value; I'm leaving in support for it, but don't declare
+    # a temporary variable until it's needed.
+    for l in sorted(list):
+        if 'max_aggregate' in l.flags:
+            f.write('\tint64_t v;\n\n')
+            break;
     for l in sorted(list):
         if 'no_aggregate' in l.flags:
-            continue;
+            o = '\tto->' + l.name + ' = from[0]->' + l.name + ';\n'
        elif 'max_aggregate' in l.flags:
-            o = 'if (c->' + l.name + '.v > p->' + l.name +\
-                '.v)\n\t    p->' + l.name + '.v = c->' + l.name + '.v;'
+            o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) >\n' +\
+                '\t    to->' + l.name + ')\n' +\
+                '\t\tto->' + l.name + ' = v;\n'
         else:
-            o = 'p->' + l.name + '.v += c->' + l.name + '.v;'
-        f.write('\t' + o + '\n')
+            o = '\tto->' + l.name + ' += WT_STAT_READ(from, ' + l.name + ');\n'
+        if len(o) > 72:     # Account for the leading tab.
+            o = o.replace(' += ', ' +=\n\t    ')
+        f.write(o)
     f.write('}\n')
 
 # Write the stat initialization and refresh routines to the stat.c file.
@@ -158,12 +203,11 @@ f = open(tmp_file, 'w')
 f.write('/* DO NOT EDIT: automatically built by dist/stat.py. */\n\n')
 f.write('#include "wt_internal.h"\n')
 
-print_func('dsrc', dsrc_stats)
-print_func('connection', connection_stats)
+print_func('dsrc', 'WT_DATA_HANDLE', dsrc_stats)
+print_func('connection', 'WT_CONNECTION_IMPL', connection_stats)
 f.close()
 compare_srcfile(tmp_file, '../src/support/stat.c')
-
 # Update the statlog file with the entries we can scale per second.
 scale_info = 'no_scale_per_second_list = [\n'
 clear_info = 'no_clear_list = [\n'
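The generator now emits an array of statistics structures, one per WT_COUNTER_SLOTS slot, plus aggregation routines, instead of a single shared structure: writers bump one slot and readers sum across slots, removing cache-line contention on hot counters. A hedged sketch of that general technique; the names and slot count here are invented, and WiredTiger's real WT_STAT_READ and slot selection differ in detail:

    #include <stdint.h>

    #define SLOTS 23                 /* Per-structure counter copies */

    struct stats {
        int64_t cursor_insert;       /* One field shown for brevity */
    };

    /* Writers bump a single slot, e.g. chosen from a thread id. */
    static inline void
    stat_incr(struct stats *s, unsigned slot)
    {
        ++s[slot % SLOTS].cursor_insert;
    }

    /* Readers aggregate every slot, in the spirit of WT_STAT_READ. */
    static inline int64_t
    stat_read(const struct stats *s)
    {
        int64_t sum;
        unsigned i;

        for (sum = 0, i = 0; i < SLOTS; ++i)
            sum += s[i].cursor_insert;
        return (sum);
    }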
diff --git a/dist/stat_data.py b/dist/stat_data.py
index caf68364696..c91fc921380 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -7,14 +7,21 @@
 # currently open'.
 # NOTE: All statistics descriptions must have a prefix string followed by ':'.
 #
-# Optional configuration flags:
-#    no_clear    Value not cleared when statistics cleared
-#    no_scale    Don't scale value per second in the logging tool script
-#
 # Data-source statistics are normally aggregated across the set of underlying
 # objects. Additional optionaly configuration flags are available:
 #    no_aggregate     Ignore the value when aggregating statistics
 #    max_aggregate    Take the maximum value when aggregating statistics
+#
+# Optional configuration flags:
+#    no_clear    Value not cleared when statistics cleared
+#    no_scale    Don't scale value per second in the logging tool script
+#
+# The no_clear flag is a little complicated: it means we don't clear the values
+# when resetting statistics after each run (necessary when the WiredTiger engine
+# is updating values that persist over multiple runs, for example the count of
+# cursors), but it also causes the underlying display routines to not treat the
+# change between displays as relative to the number of seconds, that is, it's an
+# absolute value. The no_clear flag should be set in either case.
 
 from operator import attrgetter
 import sys
@@ -120,9 +127,9 @@ connection_stats = [
     AsyncStat('async_alloc_race', 'number of allocation state races'),
     AsyncStat('async_alloc_view',
         'number of operation slots viewed for allocation'),
+    AsyncStat('async_cur_queue', 'current work queue length'),
     AsyncStat('async_flush', 'number of flush calls'),
     AsyncStat('async_full', 'number of times operation allocation failed'),
-    AsyncStat('async_cur_queue', 'current work queue length'),
     AsyncStat('async_max_queue', 'maximum work queue length',
         'no_clear,no_scale'),
     AsyncStat('async_nowork', 'number of times worker found no work'),
@@ -149,11 +156,11 @@ connection_stats = [
     ##########################################
     CacheStat('cache_bytes_dirty',
         'tracked dirty bytes in the cache', 'no_clear,no_scale'),
-    CacheStat('cache_bytes_inuse',
-        'bytes currently in the cache', 'no_clear,no_scale'),
     CacheStat('cache_bytes_internal',
         'tracked bytes belonging to internal pages in the cache',
         'no_clear,no_scale'),
+    CacheStat('cache_bytes_inuse',
+        'bytes currently in the cache', 'no_clear,no_scale'),
     CacheStat('cache_bytes_leaf',
         'tracked bytes belonging to leaf pages in the cache',
         'no_clear,no_scale'),
@@ -165,11 +172,11 @@ connection_stats = [
     CacheStat('cache_bytes_read', 'bytes read into cache'),
     CacheStat('cache_bytes_write', 'bytes written from cache'),
     CacheStat('cache_eviction_app', 'pages evicted by application threads'),
+    CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
     CacheStat('cache_eviction_clean', 'unmodified pages evicted'),
     CacheStat('cache_eviction_deepen',
         'page split during eviction deepened the tree'),
     CacheStat('cache_eviction_dirty', 'modified pages evicted'),
-    CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
     CacheStat('cache_eviction_fail',
         'pages selected for eviction unable to be evicted'),
     CacheStat('cache_eviction_force',
@@ -197,21 +204,35 @@ connection_stats = [
     CacheStat('cache_eviction_worker_evicting',
         'eviction worker thread evicting pages'),
     CacheStat('cache_inmem_split', 'in-memory page splits'),
+    CacheStat('cache_inmem_splittable',
+        'in-memory page passed criteria to be split'),
+    CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
+    CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
     CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
     CacheStat('cache_pages_dirty',
         'tracked dirty pages in the cache', 'no_clear,no_scale'),
     CacheStat('cache_pages_inuse',
         'pages currently held in the cache', 'no_clear,no_scale'),
     CacheStat('cache_read', 'pages read into cache'),
+    CacheStat('cache_read_lookaside',
+        'pages read into cache requiring lookaside entries'),
     CacheStat('cache_write', 'pages written from cache'),
+    CacheStat('cache_write_lookaside',
+        'page written requiring lookaside records'),
+    CacheStat('cache_write_restore',
+        'pages written requiring in-memory restoration'),
 
     ##########################################
     # Dhandle statistics
     ##########################################
-    DhandleStat('dh_conn_handles', 'connection dhandles swept'),
-    DhandleStat('dh_conn_ref', 'connection candidate referenced'),
-    DhandleStat('dh_conn_sweeps', 'connection sweeps'),
-    DhandleStat('dh_conn_tod', 'connection time-of-death sets'),
+    DhandleStat('dh_conn_handle_count',
+        'connection data handles currently active', 'no_clear,no_scale'),
+    DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'),
+    DhandleStat('dh_sweep_remove',
+        'connection sweep dhandles removed from hash list'),
+    DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'),
+    DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'),
+    DhandleStat('dh_sweeps', 'connection sweeps'),
     DhandleStat('dh_session_handles', 'session dhandles swept'),
     DhandleStat('dh_session_sweeps', 'session sweep attempts'),
@@ -225,8 +246,8 @@ connection_stats = [
     LogStat('log_compress_len', 'total size of compressed records'),
     LogStat('log_compress_mem', 'total in-memory size of compressed records'),
     LogStat('log_compress_small', 'log records too small to compress'),
-    LogStat('log_compress_writes', 'log records compressed'),
     LogStat('log_compress_write_fails', 'log records not compressed'),
+    LogStat('log_compress_writes', 'log records compressed'),
     LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'),
     LogStat('log_prealloc_files', 'pre-allocated log files prepared'),
     LogStat('log_prealloc_max',
@@ -236,20 +257,18 @@ connection_stats = [
     LogStat('log_scan_records', 'records processed by log scan'),
     LogStat('log_scan_rereads', 'log scan records requiring two reads'),
     LogStat('log_scans', 'log scan operations'),
-    LogStat('log_sync', 'log sync operations'),
-    LogStat('log_sync_dir', 'log sync_dir operations'),
-    LogStat('log_writes', 'log write operations'),
-    LogStat('log_write_lsn', 'log server thread advances write LSN'),
-
+    LogStat('log_slot_closes', 'consolidated slot closures'),
     LogStat('log_slot_coalesced', 'written slots coalesced'),
     LogStat('log_slot_consolidated', 'logging bytes consolidated'),
-    LogStat('log_slot_closes', 'consolidated slot closures'),
     LogStat('log_slot_joins', 'consolidated slot joins'),
     LogStat('log_slot_races', 'consolidated slot join races'),
-    LogStat('log_slot_toobig', 'record size exceeded maximum'),
-    LogStat('log_slot_toosmall',
-        'failed to find a slot large enough for record'),
+    LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'),
     LogStat('log_slot_transitions', 'consolidated slot join transitions'),
+    LogStat('log_slot_unbuffered', 'consolidated slot unbuffered writes'),
+    LogStat('log_sync', 'log sync operations'),
+    LogStat('log_sync_dir', 'log sync_dir operations'),
+    LogStat('log_write_lsn', 'log server thread advances write LSN'),
+    LogStat('log_writes', 'log write operations'),
 
     ##########################################
     # Reconciliation statistics
@@ -268,6 +287,8 @@ connection_stats = [
     TxnStat('txn_checkpoint', 'transaction checkpoints'),
     TxnStat('txn_checkpoint_generation',
         'transaction checkpoint generation', 'no_clear,no_scale'),
+    TxnStat('txn_checkpoint_running',
+        'transaction checkpoint currently running', 'no_clear,no_scale'),
     TxnStat('txn_checkpoint_time_max',
         'transaction checkpoint max time (msecs)', 'no_clear,no_scale'),
     TxnStat('txn_checkpoint_time_min',
@@ -276,17 +297,16 @@ connection_stats = [
         'transaction checkpoint most recent time (msecs)',
         'no_clear,no_scale'),
     TxnStat('txn_checkpoint_time_total',
         'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
-    TxnStat('txn_checkpoint_running',
-        'transaction checkpoint currently running', 'no_clear,no_scale'),
+    TxnStat('txn_commit', 'transactions committed'),
+    TxnStat('txn_fail_cache',
+        'transaction failures due to cache overflow'),
     TxnStat('txn_pinned_checkpoint_range',
         'transaction range of IDs currently pinned by a checkpoint',
-            'no_clear,no_scale'),
+        'no_clear,no_scale'),
     TxnStat('txn_pinned_range',
         'transaction range of IDs currently pinned', 'no_clear,no_scale'),
-    TxnStat('txn_sync', 'transaction sync calls'),
-    TxnStat('txn_commit', 'transactions committed'),
-    TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'),
     TxnStat('txn_rollback', 'transactions rolled back'),
+    TxnStat('txn_sync', 'transaction sync calls'),
 
     ##########################################
     # LSM statistics
@@ -322,6 +342,7 @@ connection_stats = [
     CursorStat('cursor_prev', 'cursor prev calls'),
     CursorStat('cursor_remove', 'cursor remove calls'),
     CursorStat('cursor_reset', 'cursor reset calls'),
+    CursorStat('cursor_restart', 'cursor restarted searches'),
     CursorStat('cursor_search', 'cursor search calls'),
     CursorStat('cursor_search_near', 'cursor search near calls'),
     CursorStat('cursor_update', 'cursor update calls'),
@@ -362,6 +383,7 @@ dsrc_stats = [
     CursorStat('cursor_remove', 'remove calls'),
     CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed'),
     CursorStat('cursor_reset', 'reset calls'),
+    CursorStat('cursor_restart', 'restarted searches'),
     CursorStat('cursor_search', 'search calls'),
     CursorStat('cursor_search_near', 'search near calls'),
     CursorStat('cursor_update', 'update calls'),
@@ -378,6 +400,8 @@ dsrc_stats = [
         'column-store fixed-size leaf pages', 'no_scale'),
     BtreeStat('btree_column_internal',
         'column-store internal pages', 'no_scale'),
+    BtreeStat('btree_column_rle',
+        'column-store variable-size RLE encoded values', 'no_scale'),
     BtreeStat('btree_column_variable',
         'column-store variable-size leaf pages', 'no_scale'),
     BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'),
@@ -421,9 +445,9 @@ dsrc_stats = [
     ##########################################
     # Block manager statistics
     ##########################################
-    BlockStat('block_alloc', 'blocks allocated'),
     BlockStat('allocation_size',
         'file allocation unit size', 'no_aggregate,no_scale'),
+    BlockStat('block_alloc', 'blocks allocated'),
     BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'),
     BlockStat('block_extension', 'allocations requiring file extension'),
     BlockStat('block_free', 'blocks freed'),
@@ -450,20 +474,28 @@ dsrc_stats = [
     CacheStat('cache_eviction_internal', 'internal pages evicted'),
     CacheStat('cache_eviction_split', 'pages split during eviction'),
     CacheStat('cache_inmem_split', 'in-memory page splits'),
+    CacheStat('cache_inmem_splittable',
+        'in-memory page passed criteria to be split'),
     CacheStat('cache_overflow_value',
         'overflow values cached in memory', 'no_scale'),
     CacheStat('cache_read', 'pages read into cache'),
+    CacheStat('cache_read_lookaside',
+        'pages read into cache requiring lookaside entries'),
     CacheStat('cache_read_overflow', 'overflow pages read into cache'),
     CacheStat('cache_write', 'pages written from cache'),
+    CacheStat('cache_write_lookaside',
+        'page written requiring lookaside records'),
+    CacheStat('cache_write_restore',
+        'pages written requiring in-memory restoration'),
 
     ##########################################
     # Compression statistics
     ##########################################
-    CompressStat('compress_raw_ok', 'raw compression call succeeded'),
     CompressStat('compress_raw_fail',
         'raw compression call failed, no additional data available'),
     CompressStat('compress_raw_fail_temporary',
         'raw compression call failed, additional data available'),
+    CompressStat('compress_raw_ok', 'raw compression call succeeded'),
     CompressStat('compress_read', 'compressed pages read'),
     CompressStat('compress_write', 'compressed pages written'),
     CompressStat('compress_write_fail', 'page written failed to compress'),
@@ -474,21 +506,21 @@ dsrc_stats = [
     # Reconciliation statistics
     ##########################################
     RecStat('rec_dictionary', 'dictionary matches'),
+    RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
+    RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
+    RecStat('rec_multiblock_max',
+        'maximum blocks required for a page', 'max_aggregate,no_scale'),
     RecStat('rec_overflow_key_internal', 'internal-page overflow keys'),
     RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'),
     RecStat('rec_overflow_value', 'overflow values written'),
-    RecStat('rec_page_match', 'page checksum matches'),
     RecStat('rec_page_delete', 'pages deleted'),
+    RecStat('rec_page_match', 'page checksum matches'),
     RecStat('rec_pages', 'page reconciliation calls'),
     RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
     RecStat('rec_prefix_compression',
         'leaf page key bytes discarded using prefix compression'),
     RecStat('rec_suffix_compression',
         'internal page key bytes discarded using suffix compression'),
-    RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
-    RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
-    RecStat('rec_multiblock_max',
-        'maximum blocks required for a page', 'max_aggregate,no_scale'),
 
     ##########################################
     # Transaction statistics
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 6905169c4c2..213e058d4cc 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -1064,7 +1064,8 @@ main(void)
     home = NULL;
 
     /*! [Open a connection] */
-    ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn);
+    ret = wiredtiger_open(home, NULL,
+        "create,cache_size=5GB,log=(enabled,recover=on)", &conn);
     /*! [Open a connection] */
 
     if (ret == 0)
diff --git a/examples/c/ex_log.c b/examples/c/ex_log.c
index 136cca900cd..d5a8f32487d 100644
--- a/examples/c/ex_log.c
+++ b/examples/c/ex_log.c
@@ -128,20 +128,22 @@ print_record(WT_LSN *lsn, uint32_t opcount,
  * A simple walk of the log.
  */
 static int
-simple_walk_log(WT_SESSION *session)
+simple_walk_log(WT_SESSION *session, int count_min)
 {
     WT_CURSOR *cursor;
     WT_LSN lsn;
     WT_ITEM logrec_key, logrec_value;
     uint64_t txnid;
     uint32_t fileid, opcount, optype, rectype;
-    int ret;
+    int count, ret;
 
     /*! [log cursor open] */
     ret = session->open_cursor(session, "log:", NULL, NULL, &cursor);
     /*! [log cursor open] */
 
+    count = 0;
     while ((ret = cursor->next(cursor)) == 0) {
+        count++;
         /*! [log cursor get_key] */
         ret = cursor->get_key(cursor, &lsn.file, &lsn.offset, &opcount);
         /*! [log cursor get_key] */
@@ -156,6 +158,12 @@ simple_walk_log(WT_SESSION *session)
     if (ret == WT_NOTFOUND)
         ret = 0;
     ret = cursor->close(cursor);
+    if (count < count_min) {
+        fprintf(stderr,
+            "Expected minimum %d records, found %d\n",
+            count_min, count);
+        abort();
+    }
     return (ret);
 }
 /*! [log cursor walk] */
@@ -206,11 +214,13 @@ walk_log(WT_SESSION *session)
 
     /*
      * If the operation is a put, replay it here on the backup
-     * connection. Note, we cheat by looking only for fileid 1
-     * in this example. The metadata is fileid 0.
+     * connection.
+     *
+     * !!!
+     * Minor cheat: the metadata is fileid 0, skip its records.
      */
-    if (fileid == 1 && rectype == WT_LOGREC_COMMIT &&
-        optype == WT_LOGOP_ROW_PUT) {
+    if (fileid != 0 &&
+        rectype == WT_LOGREC_COMMIT && optype == WT_LOGOP_ROW_PUT) {
         if (!in_txn) {
             ret = session2->begin_transaction(session2, NULL);
@@ -276,9 +286,10 @@ main(void)
     WT_CONNECTION *wt_conn;
     WT_CURSOR *cursor;
     WT_SESSION *session;
-    int i, record_count, ret;
+    int count_min, i, record_count, ret;
     char cmd_buf[256], k[16], v[16];
 
+    count_min = 0;
     snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s %s && mkdir %s %s",
         home1, home2, home1, home2);
     if ((ret = system(cmd_buf)) != 0) {
@@ -293,6 +304,7 @@ main(void)
     ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
     ret = session->create(session, uri, "key_format=S,value_format=S");
+    count_min++;
 
     ret = session->open_cursor(session, uri, NULL, NULL, &cursor);
     /*
@@ -304,6 +316,7 @@ main(void)
         cursor->set_key(cursor, k);
         cursor->set_value(cursor, v);
         ret = cursor->insert(cursor);
+        count_min++;
     }
     ret = session->begin_transaction(session, NULL);
     /*
@@ -317,10 +330,12 @@ main(void)
         ret = cursor->insert(cursor);
     }
     ret = session->commit_transaction(session, NULL);
+    count_min++;
     ret = cursor->close(cursor);
 
     /*! [log cursor printf] */
     ret = session->log_printf(session, "Wrote %d records", record_count);
+    count_min++;
     /*! [log cursor printf] */
 
     /*
@@ -336,7 +351,7 @@ main(void)
     }
     ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
-    ret = simple_walk_log(session);
+    ret = simple_walk_log(session, count_min);
     ret = walk_log(session);
     ret = wt_conn->close(wt_conn, NULL);
     return (ret);
diff --git a/examples/python/ex_access.py b/examples/python/ex_access.py
index 8eeefd56cf7..2940ac63625 100755
--- a/examples/python/ex_access.py
+++ b/examples/python/ex_access.py
@@ -50,6 +50,6 @@ cursor.insert()
 # Iterate through the records
 cursor.reset()
 for key, value in cursor:
-    print('Got record: ' + key + ' : ' + value)
+    print('Got record: %s : %s' % (key, value))
 
 conn.close()
diff --git a/examples/python/ex_stat.py b/examples/python/ex_stat.py
index e27177403cc..af2c4f7a1a7 100755
--- a/examples/python/ex_stat.py
+++ b/examples/python/ex_stat.py
@@ -32,6 +32,7 @@ import os
 from wiredtiger import wiredtiger_open,WIREDTIGER_VERSION_STRING,stat
 
+
 def main():
     # Create a clean test directory for this run of the test program
     os.system('rm -rf WT_HOME')
@@ -39,16 +40,16 @@ def main():
     # Connect to the database and open a session
     conn = wiredtiger_open('WT_HOME', 'create,statistics=(all)')
     session = conn.open_session()
-    
+
     # Create a simple table
     session.create('table:access', 'key_format=S,value_format=S')
-    
+
     # Open a cursor and insert a record
     cursor = session.open_cursor('table:access', None)
-    cursor['key'] = 'value' 
+    cursor['key'] = 'value'
     cursor.close()
-    
+
     session.checkpoint()
     print WIREDTIGER_VERSION_STRING
     print_database_stats(session)
@@ -57,46 +58,51 @@ def main():
     print_derived_stats(session)
     conn.close()
 
+
 def print_database_stats(session):
     statcursor = session.open_cursor("statistics:")
     print_cursor(statcursor)
     statcursor.close()
 
+
 def print_file_stats(session):
     fstatcursor = session.open_cursor("statistics:table:access")
     print_cursor(fstatcursor)
     fstatcursor.close()
 
+
 def print_overflow_pages(session):
     ostatcursor = session.open_cursor("statistics:table:access")
     val = ostatcursor[stat.dsrc.btree_overflow]
-    if val != 0 :
-        print str(val[0]) + '=' + str(val[1])
+    if val != 0:
+        print '%s=%s' % (str(val[0]), str(val[1]))
     ostatcursor.close()
 
+
 def print_derived_stats(session):
     dstatcursor = session.open_cursor("statistics:table:access")
     ckpt_size = dstatcursor[stat.dsrc.block_checkpoint_size][1]
     file_size = dstatcursor[stat.dsrc.block_size][1]
     percent = 0
-    if file_size != 0 :
+    if file_size != 0:
         percent = 100 * ((float(file_size) - float(ckpt_size)) / float(file_size))
-    print "Table is %" + str(percent) + " fragmented"
+    print "Table is %%%s fragmented" % str(percent)
 
     app_insert = int(dstatcursor[stat.dsrc.cursor_insert_bytes][1])
     app_remove = int(dstatcursor[stat.dsrc.cursor_remove_bytes][1])
     app_update = int(dstatcursor[stat.dsrc.cursor_update_bytes][1])
-    fs_writes  = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
+    fs_writes = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
 
-    if(app_insert + app_remove + app_update != 0):
+    if app_insert + app_remove + app_update != 0:
         print "Write amplification is " + '{:.2f}'.format(fs_writes / (app_insert + app_remove + app_update))
     dstatcursor.close()
 
+
 def print_cursor(mycursor):
     while mycursor.next() == 0:
         val = mycursor.get_value()
-        if val[1] != '0' :
-            print str(val[0]) + '=' + str(val[1])
+        if val[1] != '0':
+            print '%s=%s' % (str(val[0]), str(val[1]))
 
 if __name__ == "__main__":
     main()
diff --git a/ext/encryptors/rotn/rotn_encrypt.c b/ext/encryptors/rotn/rotn_encrypt.c
index 503dcae83a7..5b29e66c503 100644
--- a/ext/encryptors/rotn/rotn_encrypt.c
+++ b/ext/encryptors/rotn/rotn_encrypt.c
@@ -68,7 +68,7 @@
 typedef struct {
     WT_ENCRYPTOR encryptor;      /* Must come first */
 
-    WT_EXTENSION_API *wt_api;    /* Extension API */
+    WT_EXTENSION_API *wtext;     /* Extension API */
 
     int rot_N;                   /* rotN value */
     char *keyid;                 /* Saved keyid */
@@ -76,6 +76,7 @@ typedef struct {
     u_char *shift_forw;          /* Encrypt shift data from secretkey */
     u_char *shift_back;          /* Decrypt shift data from secretkey */
     size_t shift_len;            /* Length of shift* byte arrays */
+    int force_error;             /* Force a decrypt error for testing */
 } ROTN_ENCRYPTOR;
 /*! [WT_ENCRYPTOR initialization structure] */
 
@@ -84,6 +85,22 @@ typedef struct {
 #define IV_LEN 16
 
 /*
+ * rotn_error --
+ *    Display an error from this module in a standard way.
+ */
+static int
+rotn_error(ROTN_ENCRYPTOR *encryptor, WT_SESSION *session, int err,
+    const char *msg)
+{
+    WT_EXTENSION_API *wtext;
+
+    wtext = encryptor->wtext;
+    (void)wtext->err_printf(wtext, session,
+        "rotn encryption: %s: %s", msg, wtext->strerror(wtext, NULL, err));
+    return (err);
+}
+
+/*
  * make_cksum --
  *    This is where one would call a checksum function on the encrypted
  *    buffer. Here we just put a constant value in it.
@@ -221,13 +238,18 @@ rotn_decrypt(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
     (void)session;        /* Unused */
 
     /*
+     * For certain tests, force an error we can recognize.
+     */
+    if (rotn_encryptor->force_error)
+        return (-1000);
+
+    /*
      * Make sure it is big enough.
      */
     mylen = src_len - (CHKSUM_LEN + IV_LEN);
-    if (dst_len < mylen) {
-        fprintf(stderr, "Rotate: ENOMEM ERROR\n");
-        return (ENOMEM);
-    }
+    if (dst_len < mylen)
+        return (rotn_error(rotn_encryptor, session,
+            ENOMEM, "decrypt buffer not big enough"));
 
     /*
      * !!! Most implementations would verify the checksum here.
@@ -286,7 +308,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
     const ROTN_ENCRYPTOR *orig;
     ROTN_ENCRYPTOR *rotn_encryptor;
     WT_CONFIG_ITEM keyid, secret;
-    WT_EXTENSION_API *wt_api;
+    WT_EXTENSION_API *wtext;
     size_t i, len;
     int ret, keyid_val;
     u_char base;
@@ -295,7 +317,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
     keyid_val = 0;
 
     orig = (const ROTN_ENCRYPTOR *)encryptor;
-    wt_api = orig->wt_api;
+    wtext = orig->wtext;
 
     if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
         return (errno);
@@ -305,7 +327,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
     /*
      * Stash the keyid from the configuration string.
      */
-    if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+    if ((ret = wtext->config_get(wtext, session, encrypt_config,
         "keyid", &keyid)) == 0 && keyid.len != 0) {
         /*
          * In this demonstration, we expect keyid to be a number.
@@ -327,7 +349,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
      * We stash the secret key from the configuration string
      * and build some shift bytes to make encryption/decryption easy.
      */
-    if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+    if ((ret = wtext->config_get(wtext, session, encrypt_config,
         "secretkey", &secret)) == 0 && secret.len != 0) {
         len = secret.len;
         if ((rotn_encryptor->secretkey = malloc(len + 1)) == NULL ||
@@ -396,6 +418,53 @@ rotn_terminate(WT_ENCRYPTOR *encryptor, WT_SESSION *session)
 }
 /*! [WT_ENCRYPTOR terminate] */
 
+/*
+ * rotn_configure --
+ *    WiredTiger no-op encryption configuration.
+ */
+static int
+rotn_configure(ROTN_ENCRYPTOR *rotn_encryptor, WT_CONFIG_ARG *config)
+{
+    WT_CONFIG_ITEM k, v;
+    WT_CONFIG_PARSER *config_parser;
+    WT_EXTENSION_API *wtext;    /* Extension API */
+    int ret, t_ret;
+
+    wtext = rotn_encryptor->wtext;
+
+    /* Get the configuration string. */
+    if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0)
+        return (rotn_error(rotn_encryptor, NULL, ret,
+            "WT_EXTENSION_API.config_get"));
+
+    /* Step through the list of configuration options. */
+    if ((ret = wtext->config_parser_open(
+        wtext, NULL, v.str, v.len, &config_parser)) != 0)
+        return (rotn_error(rotn_encryptor, NULL, ret,
+            "WT_EXTENSION_API.config_parser_open"));
+
+    while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+        if (strncmp("rotn_force_error", k.str, k.len) == 0 &&
+            strlen("rotn_force_error") == k.len) {
+            rotn_encryptor->force_error = v.val == 0 ? 0 : 1;
+            continue;
+        }
+        else {
+            (void)config_parser->close(config_parser);
+            return (rotn_error(rotn_encryptor, NULL, EINVAL,
+                "unknown config key"));
+        }
+    }
+    if ((t_ret = config_parser->close(config_parser)) != 0)
+        return (rotn_error(rotn_encryptor, NULL, t_ret,
+            "WT_CONFIG_PARSER.close"));
+    if (ret != WT_NOTFOUND)
+        return (rotn_error(rotn_encryptor, NULL, ret,
+            "WT_CONFIG_PARSER.next"));
+
+    return (0);
+}
+
 /*! [WT_ENCRYPTOR initialization function] */
 /*
  * wiredtiger_extension_init --
@@ -405,8 +474,7 @@ int
 wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
 {
     ROTN_ENCRYPTOR *rotn_encryptor;
-
-    (void)config;        /* Unused parameters */
+    int ret;
 
     if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
         return (errno);
@@ -423,9 +491,12 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
     rotn_encryptor->encryptor.sizing = rotn_sizing;
     rotn_encryptor->encryptor.customize = rotn_customize;
     rotn_encryptor->encryptor.terminate = rotn_terminate;
+    rotn_encryptor->wtext = connection->get_extension_api(connection);
 
-    rotn_encryptor->wt_api = connection->get_extension_api(connection);
-
+    if ((ret = rotn_configure(rotn_encryptor, config)) != 0) {
+        free(rotn_encryptor);
+        return (ret);
+    }
     /* Load the encryptor */
     return (connection->add_encryptor(
         connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL));
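For context, the new rotn_configure path is driven by the config item an application supplies when loading the extension. A hedged sketch of turning on the new force_error switch; the shared-library path and keyid are illustrative, and the exact extensions syntax should be checked against the WiredTiger documentation:

    #include <wiredtiger.h>

    /* Sketch: load rotn with rotn_force_error enabled for testing. */
    int
    open_with_rotn(WT_CONNECTION **connp)
    {
        return (wiredtiger_open("WT_HOME", NULL,
            "create,"
            "extensions=[ext/encryptors/rotn/.libs/libwiredtiger_rotn.so="
            "(config=(rotn_force_error=true))],"
            "encryption=(name=rotn,keyid=13)",
            connp));
    }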
diff --git a/ext/extractors/csv/csv_extractor.c b/ext/extractors/csv/csv_extractor.c
index 0dd110955ad..34b8d7c7c64 100644
--- a/ext/extractors/csv/csv_extractor.c
+++ b/ext/extractors/csv/csv_extractor.c
@@ -128,7 +128,7 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
         return (errno);
     *csv_extractor = *orig;
-    csv_extractor->field_num = field_num;
+    csv_extractor->field_num = (int)field_num;
 
     *customp = (WT_EXTRACTOR *)csv_extractor;
     return (0);
 }
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 44e492cb0e5..416c3c84f7b 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -43,7 +43,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
      * is a possibility a duplicate entry might be inserted, but
      * that is not harmful.
*/ - STAILQ_FOREACH(af, &async->formatqh, q) { + TAILQ_FOREACH(af, &async->formatqh, q) { if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash) goto setup; } @@ -71,7 +71,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri, WT_ERR(c->close(c)); c = NULL; - STAILQ_INSERT_HEAD(&async->formatqh, af, q); + TAILQ_INSERT_HEAD(&async->formatqh, af, q); __wt_spin_unlock(session, &async->ops_lock); WT_ERR(wt_session->close(wt_session, NULL)); @@ -151,15 +151,16 @@ retry: * If we can set the state then the op entry is ours. * Start the next search at the next entry after this one. */ - if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) { + if (!__wt_atomic_cas32(&op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) { WT_STAT_FAST_CONN_INCR(session, async_alloc_race); goto retry; } WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view); WT_RET(__async_get_format(conn, uri, config, op)); - op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1); + op->unique_id = __wt_atomic_add64(&async->op_id, 1); op->optype = WT_AOP_NONE; - (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size); + (void)__wt_atomic_store32( + &async->ops_index, (i + 1) % conn->async_size); *opp = op; return (0); } @@ -206,15 +207,15 @@ __wt_async_stats_update(WT_SESSION_IMPL *session) { WT_ASYNC *async; WT_CONNECTION_IMPL *conn; - WT_CONNECTION_STATS *stats; + WT_CONNECTION_STATS **stats; conn = S2C(session); async = conn->async; if (async == NULL) return; - stats = &conn->stats; - WT_STAT_SET(stats, async_cur_queue, async->cur_queue); - WT_STAT_SET(stats, async_max_queue, async->max_queue); + stats = conn->stats; + WT_STAT_SET(session, stats, async_cur_queue, async->cur_queue); + WT_STAT_SET(session, stats, async_max_queue, async->max_queue); F_SET(conn, WT_CONN_SERVER_ASYNC); } @@ -237,7 +238,7 @@ __async_start(WT_SESSION_IMPL *session) */ WT_RET(__wt_calloc_one(session, &conn->async)); async = conn->async; - STAILQ_INIT(&async->formatqh); + TAILQ_INIT(&async->formatqh); WT_RET(__wt_spin_init(session, &async->ops_lock, "ops")); WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond)); WT_RET(__wt_async_op_init(session)); @@ -461,9 +462,9 @@ __wt_async_destroy(WT_SESSION_IMPL *session) } /* Free format resources */ - af = STAILQ_FIRST(&async->formatqh); + af = TAILQ_FIRST(&async->formatqh); while (af != NULL) { - afnext = STAILQ_NEXT(af, q); + afnext = TAILQ_NEXT(af, q); __wt_free(session, af->uri); __wt_free(session, af->config); __wt_free(session, af->key_format); @@ -514,7 +515,7 @@ retry: */ __wt_sleep(0, 100000); - if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE, + if (!__wt_atomic_cas32(&async->flush_state, WT_ASYNC_FLUSH_NONE, WT_ASYNC_FLUSH_IN_PROGRESS)) goto retry; /* @@ -524,7 +525,7 @@ retry: * things off the work queue with the lock. 
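A recurring theme in this change is replacing the singly-linked STAILQ and SLIST macros with doubly-linked TAILQ throughout. The <sys/queue.h> interfaces are deliberately parallel, so most call sites only swap the macro prefix (and a link member name, hashl becoming hashq); what the doubly-linked form buys, presumably, is O(1) removal of arbitrary elements and reverse traversal. A freestanding sketch of the TAILQ idiom used above:

#include <stdlib.h>
#include <sys/queue.h>

struct entry {
	int value;
	TAILQ_ENTRY(entry) q;			/* Forward/backward links. */
};
TAILQ_HEAD(entry_qh, entry);

static void
tailq_example(void)
{
	struct entry_qh head;
	struct entry *e, *enext;

	TAILQ_INIT(&head);
	if ((e = calloc(1, sizeof(*e))) != NULL)
		TAILQ_INSERT_HEAD(&head, e, q);

	/* Walk and free, the same shape as the format-queue teardown. */
	e = TAILQ_FIRST(&head);
	while (e != NULL) {
		enext = TAILQ_NEXT(e, q);
		free(e);
		e = enext;
	}
}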
*/ async->flush_count = 0; - (void)WT_ATOMIC_ADD8(async->flush_gen, 1); + (void)__wt_atomic_add64(&async->flush_gen, 1); WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE); async->flush_op.state = WT_ASYNCOP_READY; WT_ERR(__wt_async_op_enqueue(session, &async->flush_op)); diff --git a/src/async/async_op.c b/src/async/async_op.c index d0c58f584cc..469dbc8e615 100644 --- a/src/async/async_op.c +++ b/src/async/async_op.c @@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id) asyncop->c.set_key = __wt_cursor_set_key; asyncop->c.get_value = __wt_cursor_get_value; asyncop->c.set_value = __wt_cursor_set_value; - asyncop->c.recno = 0; + asyncop->c.recno = WT_RECNO_OOB; memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf)); memset(&asyncop->c.key, 0, sizeof(asyncop->c.key)); memset(&asyncop->c.value, 0, sizeof(asyncop->c.value)); @@ -280,7 +280,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op) * Enqueue op at the tail of the work queue. * We get our slot in the ring buffer to use. */ - my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1); + my_alloc = __wt_atomic_add64(&async->alloc_head, 1); my_slot = my_alloc % async->async_qsize; /* @@ -300,7 +300,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op) #endif WT_PUBLISH(async->async_queue[my_slot], op); op->state = WT_ASYNCOP_ENQUEUED; - if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue) + if (__wt_atomic_add32(&async->cur_queue, 1) > async->max_queue) WT_PUBLISH(async->max_queue, async->cur_queue); /* * Multiple threads may be adding ops to the queue. We need to wait diff --git a/src/async/async_worker.c b/src/async/async_worker.c index 4f372d05d19..6a5ec5feeb0 100644 --- a/src/async/async_worker.c +++ b/src/async/async_worker.c @@ -67,7 +67,7 @@ retry: * a race, try again. */ my_consume = last_consume + 1; - if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume)) + if (!__wt_atomic_cas64(&async->alloc_tail, last_consume, my_consume)) goto retry; /* * This item of work is ours to process. Clear it out of the @@ -81,7 +81,7 @@ retry: WT_ASSERT(session, async->cur_queue > 0); WT_ASSERT(session, *op != NULL); WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED); - (void)WT_ATOMIC_SUB4(async->cur_queue, 1); + (void)__wt_atomic_sub32(&async->cur_queue, 1); (*op)->state = WT_ASYNCOP_WORKING; if (*op == &async->flush_op) @@ -135,7 +135,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, if (op->optype == WT_AOP_COMPACT) return (0); WT_ASSERT(session, op->format != NULL); - STAILQ_FOREACH(ac, &worker->cursorqh, q) { + TAILQ_FOREACH(ac, &worker->cursorqh, q) { if (op->format->cfg_hash == ac->cfg_hash && op->format->uri_hash == ac->uri_hash) { /* @@ -156,7 +156,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op, ac->cfg_hash = op->format->cfg_hash; ac->uri_hash = op->format->uri_hash; ac->c = c; - STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q); + TAILQ_INSERT_HEAD(&worker->cursorqh, ac, q); worker->num_cursors++; *cursorp = c; return (0); @@ -297,7 +297,7 @@ __wt_async_worker(void *arg) async = conn->async; worker.num_cursors = 0; - STAILQ_INIT(&worker.cursorqh); + TAILQ_INIT(&worker.cursorqh); while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) && F_ISSET(session, WT_SESSION_SERVER_ASYNC)) { WT_ERR(__async_op_dequeue(conn, session, &op)); @@ -316,7 +316,7 @@ __wt_async_worker(void *arg) * the queue. 
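The enqueue/dequeue code above is a ticket-style ring buffer: producers atomically increment alloc_head and take the result modulo the queue size to claim a unique slot, while consumers advance alloc_tail with compare-and-swap and retry on a lost race. A reduced C11 model of both sides (names invented; wrap-around and full-queue handling omitted):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define QSIZE	1024

static _Atomic uint64_t alloc_head;	/* Next producer ticket. */
static _Atomic uint64_t alloc_tail;	/* Last consumed ticket. */

/* Producer: claim a unique slot index. */
static uint64_t
producer_slot(void)
{
	/* fetch_add returns the old value; +1 matches the diff's add64. */
	return ((atomic_fetch_add(&alloc_head, 1) + 1) % QSIZE);
}

/* Consumer: try to advance the tail past last_consume; false on a race. */
static bool
consumer_claim(uint64_t last_consume)
{
	uint64_t my_consume = last_consume + 1;

	return (atomic_compare_exchange_strong(
	    &alloc_tail, &last_consume, my_consume));
}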
*/ WT_ORDERED_READ(flush_gen, async->flush_gen); - if (WT_ATOMIC_ADD4(async->flush_count, 1) == + if (__wt_atomic_add32(&async->flush_count, 1) == conn->async_workers) { /* * We're last. All workers accounted for so @@ -346,9 +346,9 @@ err: WT_PANIC_MSG(session, ret, "async worker error"); * Worker thread cleanup, close our cached cursors and free all the * WT_ASYNC_CURSOR structures. */ - ac = STAILQ_FIRST(&worker.cursorqh); + ac = TAILQ_FIRST(&worker.cursorqh); while (ac != NULL) { - acnext = STAILQ_NEXT(ac, q); + acnext = TAILQ_NEXT(ac, q); WT_TRET(ac->c->close(ac->c)); __wt_free(session, ac); ac = acnext; diff --git a/src/block/block_ext.c b/src/block/block_ext.c index d593537446b..018f6a20164 100644 --- a/src/block/block_ext.c +++ b/src/block/block_ext.c @@ -86,7 +86,7 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off) * __block_first_srch -- * Search the skiplist for the first available slot. */ -static inline int +static inline bool __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack) { WT_EXT *ext; @@ -99,11 +99,11 @@ __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack) if (ext->size >= size) break; if (ext == NULL) - return (0); + return (false); /* Build a stack for the offset we want. */ __block_off_srch(head, ext->off, stack, 0); - return (1); + return (true); } /* @@ -251,7 +251,7 @@ __block_off_insert( * Return if any part of a specified range appears on a specified extent * list. */ -static int +static bool __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size) { WT_EXT *before, *after; @@ -261,10 +261,10 @@ __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size) /* If "before" or "after" overlaps, we have a winner. */ if (before != NULL && before->off + before->size > off) - return (1); + return (true); if (after != NULL && off + size > after->off) - return (1); - return (0); + return (true); + return (false); } /* @@ -283,7 +283,7 @@ __wt_block_misplaced(WT_SESSION_IMPL *session, * Don't check during the salvage read phase, we might be reading an * already freed overflow page. */ - if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + if (F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) return (0); /* diff --git a/src/block/block_open.c b/src/block/block_open.c index fd00e0c7deb..cfb5b000092 100644 --- a/src/block/block_open.c +++ b/src/block/block_open.c @@ -158,9 +158,9 @@ __wt_block_configure_first_fit(WT_BLOCK *block, int on) * as long as any operation wants it. */ if (on) - (void)WT_ATOMIC_ADD4(block->allocfirst, 1); + (void)__wt_atomic_add32(&block->allocfirst, 1); else - (void)WT_ATOMIC_SUB4(block->allocfirst, 1); + (void)__wt_atomic_sub32(&block->allocfirst, 1); } /* @@ -185,7 +185,7 @@ __wt_block_open(WT_SESSION_IMPL *session, hash = __wt_hash_city64(filename, strlen(filename)); bucket = hash % WT_HASH_ARRAY_SIZE; __wt_spin_lock(session, &conn->block_lock); - SLIST_FOREACH(block, &conn->blockhash[bucket], hashl) { + TAILQ_FOREACH(block, &conn->blockhash[bucket], hashq) { if (strcmp(filename, block->name) == 0) { ++block->ref; *blockp = block; @@ -398,21 +398,19 @@ err: __wt_scr_free(session, &buf); void __wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats) { + WT_UNUSED(session); + /* - * We're looking inside the live system's structure, which normally - * requires locking: the chances of a corrupted read are probably - * non-existent, and it's statistics information regardless, but it - * isn't like this is a common function for an application to call. 
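__wt_block_configure_first_fit above treats allocfirst as a counter rather than a boolean: every caller that wants first-fit allocation increments it and decrements when done, so the setting stays in force exactly as long as any operation wants it. The same idiom in portable C11, as a sketch:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint32_t allocfirst;	/* Count of interested callers. */

static void
first_fit_on(void)
{
	atomic_fetch_add(&allocfirst, 1);
}

static void
first_fit_off(void)
{
	atomic_fetch_sub(&allocfirst, 1);
}

static bool
first_fit_wanted(void)
{
	return (atomic_load(&allocfirst) != 0);
}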
+ * Reading from the live system's structure normally requires locking, + * but it's an 8B statistics read, there's no need. */ - __wt_spin_lock(session, &block->live_lock); - WT_STAT_SET(stats, allocation_size, block->allocsize); - WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size); - WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC); - WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION); - WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION); - WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes); - WT_STAT_SET(stats, block_size, block->fh->size); - __wt_spin_unlock(session, &block->live_lock); + stats->allocation_size = block->allocsize; + stats->block_checkpoint_size = (int64_t)block->live.ckpt_size; + stats->block_magic = WT_BLOCK_MAGIC; + stats->block_major = WT_BLOCK_MAJOR_VERSION; + stats->block_minor = WT_BLOCK_MINOR_VERSION; + stats->block_reuse_bytes = (int64_t)block->live.avail.bytes; + stats->block_size = block->fh->size; } /* @@ -426,7 +424,7 @@ __wt_block_manager_size( wt_off_t filesize; WT_RET(__wt_filesize_name(session, filename, &filesize)); - WT_STAT_SET(stats, block_size, filesize); + stats->block_size = filesize; return (0); } diff --git a/src/block/block_read.c b/src/block/block_read.c index 0d631396b41..9f7c869dd38 100644 --- a/src/block/block_read.c +++ b/src/block/block_read.c @@ -200,7 +200,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, if (page_cksum == cksum) return (0); - if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) __wt_errx(session, "read checksum error for %" PRIu32 "B block at " "offset %" PRIuMAX ": calculated block checksum " @@ -208,7 +208,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, "of %" PRIu32, size, (uintmax_t)offset, page_cksum, cksum); } else - if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) + if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) __wt_errx(session, "read checksum error for %" PRIu32 "B block at " "offset %" PRIuMAX ": block header checksum " @@ -218,6 +218,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, /* Panic if a checksum fails during an ordinary read. */ return (block->verify || - F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? + F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ? WT_ERROR : __wt_illegal_value(session, block->name)); } diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c index c78a6c39942..641bb8a42f7 100644 --- a/src/block/block_slvg.c +++ b/src/block/block_slvg.c @@ -73,19 +73,19 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block) * __wt_block_offset_invalid -- * Return if the block offset is insane. 
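Dropping the spinlock around __wt_block_stat rests on each statistic being a single aligned 64-bit load, which is not torn on the platforms WiredTiger supports; the fields may be mutually inconsistent, but each value read is valid. For the compiler-checked version of the same reasoning, C11 relaxed atomics express it; a sketch under that assumption, not what the tree actually does:

#include <stdatomic.h>
#include <stdint.h>

struct live_stats {
	_Atomic int64_t ckpt_size;
	_Atomic int64_t avail_bytes;
};

static void
stats_snapshot(const struct live_stats *live,
    int64_t *ckpt_sizep, int64_t *avail_bytesp)
{
	/*
	 * Relaxed loads: each value is internally consistent, the set
	 * of values is not -- the trade the diff's comment accepts.
	 */
	*ckpt_sizep =
	    atomic_load_explicit(&live->ckpt_size, memory_order_relaxed);
	*avail_bytesp =
	    atomic_load_explicit(&live->avail_bytes, memory_order_relaxed);
}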
*/ -int +bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size) { if (size == 0) /* < minimum page size */ - return (1); + return (true); if (size % block->allocsize != 0) /* not allocation-size units */ - return (1); + return (true); if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */ - return (1); + return (true); /* past end-of-file */ if (offset + (wt_off_t)size > block->fh->size) - return (1); - return (0); + return (true); + return (false); } /* diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index 18f8ca54601..79a52dbcaa3 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) } else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) { /* * The page's modification information can change underfoot if - * the page is being reconciled, lock the page down. + * the page is being reconciled, serialize with reconciliation. */ - WT_PAGE_LOCK(session, page); + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); - WT_PAGE_UNLOCK(session, page); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); WT_RET(ret); } return (0); @@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_REF *ref; - int block_manager_begin, evict_reset, skip; + int block_manager_begin, skip; WT_UNUSED(cfg); - conn = S2C(session); btree = S2BT(session); bm = btree->bm; ref = NULL; @@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) */ __wt_spin_lock(session, &btree->flush_lock); - /* - * That leaves eviction, we don't want to block eviction. Set a flag - * so reconciliation knows compaction is running. If reconciliation - * sees the flag it locks the page it's writing, we acquire the same - * lock when reading the page's modify information, serializing access. - * The same page lock blocks work on the page, but compaction is an - * uncommon, heavy-weight operation. If it's ever a problem, there's - * no reason we couldn't use an entirely separate lock than the page - * lock. - * - * We also need to ensure we don't race with an on-going reconciliation. - * After we set the flag, wait for eviction of this file to drain, and - * then let eviction continue; - */ - conn->compact_in_memory_pass = 1; - WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset)); - if (evict_reset) - __wt_evict_file_exclusive_off(session); - /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = 1; @@ -172,11 +151,7 @@ err: if (ref != NULL) if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); - /* - * Unlock will be a release barrier, use it to update the compaction - * status for reconciliation. - */ - conn->compact_in_memory_pass = 0; + /* Unblock threads writing leaf pages. */ __wt_spin_unlock(session, &btree->flush_lock); return (ret); diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c index 0aed5940533..458a1985e28 100644 --- a/src/btree/bt_cursor.c +++ b/src/btree/bt_cursor.c @@ -70,7 +70,7 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * __cursor_valid -- * Return if the cursor references an valid key/value pair. 
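The compaction change above swaps a page lock for F_CAS_ATOMIC_WAIT on a WT_PAGE_RECONCILIATION flag: spin until a compare-and-swap can set the bit in the page's flags word, serializing compaction against reconciliation without a separate lock structure. A C11 sketch of what such a primitive plausibly does (names hypothetical; the real macro lives in WiredTiger's internal headers):

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

#define PAGE_RECONCILIATION	0x01u

/* Spin until we atomically set a flag bit nobody else holds. */
static void
flag_cas_wait(_Atomic uint32_t *flagsp, uint32_t flag)
{
	uint32_t old;

	for (;;) {
		old = atomic_load(flagsp);
		if ((old & flag) == 0 &&
		    atomic_compare_exchange_weak(flagsp, &old, old | flag))
			return;
		sched_yield();		/* Another thread holds the bit. */
	}
}

/* Release: clear the flag bit. */
static void
flag_clear_atomic(_Atomic uint32_t *flagsp, uint32_t flag)
{
	atomic_fetch_and(flagsp, ~flag);
}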
*/ -static inline int +static inline bool __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; @@ -133,10 +133,10 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) - return (0); + return (false); if (updp != NULL) *updp = upd; - return (1); + return (true); } /* @@ -155,7 +155,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * keys, check for retrieval past the end of the page. */ if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries) - return (0); + return (false); /* * Updates aren't stored on the page, an update would have @@ -170,7 +170,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * "slots", check if search returned a valid slot. */ if (cbt->slot >= page->pg_var_entries) - return (0); + return (false); /* * Updates aren't stored on the page, an update would have @@ -181,7 +181,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) cip = &page->pg_var_d[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) - return (0); + return (false); break; case BTREE_ROW: /* @@ -189,7 +189,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * key as an on-page object, we're done. */ if (cbt->ins != NULL) - return (0); + return (false); /* * Check if searched returned a valid slot (the failure mode is @@ -198,19 +198,19 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * mirrors the column-store test). */ if (cbt->slot >= page->pg_row_entries) - return (0); + return (false); /* Updates are stored on the page, check for a delete. */ if (page->pg_row_upd != NULL && (upd = __wt_txn_read( session, page->pg_row_upd[cbt->slot])) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) - return (0); + return (false); if (updp != NULL) *updp = upd; } break; } - return (1); + return (true); } /* @@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ERR(__cursor_col_search(session, cbt, NULL)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) - cbt->iface.recno = 0; + cbt->iface.recno = WT_RECNO_OOB; /* * If not overwriting, fail if the key exists. Creating a @@ -549,8 +549,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ILLEGAL_VALUE_ERR(session); } -err: if (ret == WT_RESTART) +err: if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; + } /* Insert doesn't maintain a position across calls, clear resources. */ if (ret == 0) WT_TRET(__curfile_leave(cbt)); @@ -624,8 +627,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ILLEGAL_VALUE_ERR(session); } -err: if (ret == WT_RESTART) +err: if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; + } WT_TRET(__curfile_leave(cbt)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); @@ -702,8 +708,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ILLEGAL_VALUE_ERR(session); } -err: if (ret == WT_RESTART) +err: if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; + } /* * If the cursor is configured to overwrite and the record is not * found, that is exactly what we want. 
@@ -790,8 +799,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1)); WT_ILLEGAL_VALUE_ERR(session); } -err: if (ret == WT_RESTART) +err: if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); goto retry; + } /* * If successful, point the cursor at internal copies of the data. We @@ -899,7 +911,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp) * __cursor_equals -- * Return if two cursors reference the same row. */ -static inline int +static inline bool __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b) { switch (a->btree->type) { @@ -911,21 +923,21 @@ __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b) * one being returned to the application. */ if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno) - return (1); + return (true); break; case BTREE_ROW: if (a->ref != b->ref) - return (0); + return (false); if (a->ins != NULL || b->ins != NULL) { if (a->ins == b->ins) - return (1); + return (true); break; } if (a->slot == b->slot) - return (1); + return (true); break; } - return (0); + return (false); } /* @@ -993,22 +1005,27 @@ __cursor_truncate(WT_SESSION_IMPL *session, * instantiated the end cursor, so we know that page is pinned in memory * and we can proceed without concern. */ - do { - WT_RET(__wt_btcur_remove(start)); - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. - */ - for (ret = 0;;) { - if (stop != NULL && __cursor_equals(start, stop)) - break; - if ((ret = __wt_btcur_next(start, 1)) != 0) - break; - start->compare = 0; /* Exact match */ - if ((ret = rmfunc(session, start, 1)) != 0) - break; - } - } while (ret == WT_RESTART); +retry: WT_RET(__wt_btcur_remove(start)); + + /* + * Reset ret each time through so that we don't loop forever in + * the cursor equals case. + */ + for (ret = 0;;) { + if (stop != NULL && __cursor_equals(start, stop)) + break; + if ((ret = __wt_btcur_next(start, 1)) != 0) + break; + start->compare = 0; /* Exact match */ + if ((ret = rmfunc(session, start, 1)) != 0) + break; + } + + if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); + goto retry; + } WT_RET_NOTFOUND_OK(ret); return (0); @@ -1042,24 +1059,28 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, * other thread of control; in that case, repeat the full search to * refresh the page's modification information. */ - do { - WT_RET(__wt_btcur_remove(start)); - /* - * Reset ret each time through so that we don't loop forever in - * the cursor equals case. - */ - for (ret = 0;;) { - if (stop != NULL && __cursor_equals(start, stop)) - break; - if ((ret = __wt_btcur_next(start, 1)) != 0) - break; - start->compare = 0; /* Exact match */ - value = (uint8_t *)start->iface.value.data; - if (*value != 0 && - (ret = rmfunc(session, start, 1)) != 0) - break; - } - } while (ret == WT_RESTART); +retry: WT_RET(__wt_btcur_remove(start)); + /* + * Reset ret each time through so that we don't loop forever in + * the cursor equals case. 
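These hunks install one shared restart idiom across the cursor operations: when a search or modify returns WT_RESTART (the position was invalidated, for example by a page split), bump the new cursor_restart statistics and jump back to a retry label instead of wrapping the whole body in a do/while. The skeleton, with op_body and the stat bump as hypothetical stand-ins for the real code:

#include <wiredtiger.h>

extern int op_body(void);			/* Hypothetical. */
extern void stat_incr_cursor_restart(void);	/* Hypothetical. */

static int
op_with_restart(void)
{
	int ret;

retry:	if ((ret = op_body()) == WT_RESTART) {
		stat_incr_cursor_restart();
		goto retry;
	}
	return (ret);
}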
+ */ + for (ret = 0;;) { + if (stop != NULL && __cursor_equals(start, stop)) + break; + if ((ret = __wt_btcur_next(start, 1)) != 0) + break; + start->compare = 0; /* Exact match */ + value = (uint8_t *)start->iface.value.data; + if (*value != 0 && + (ret = rmfunc(session, start, 1)) != 0) + break; + } + + if (ret == WT_RESTART) { + WT_STAT_FAST_CONN_INCR(session, cursor_restart); + WT_STAT_FAST_DATA_INCR(session, cursor_restart); + goto retry; + } WT_RET_NOTFOUND_OK(ret); return (0); @@ -1132,6 +1153,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED)) } /* + * __wt_btcur_init -- + * Initialize an cursor used for internal purposes. + */ +void +__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) +{ + memset(cbt, 0, sizeof(WT_CURSOR_BTREE)); + + cbt->iface.session = &session->iface; + cbt->btree = S2BT(session); +} + +/* * __wt_btcur_open -- * Open a btree cursor. */ @@ -1147,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) * Close a btree cursor. */ int -__wt_btcur_close(WT_CURSOR_BTREE *cbt) +__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cbt->iface.session; - ret = __curfile_leave(cbt); + /* + * The in-memory split and lookaside table code creates low-level btree + * cursors to search/modify leaf pages. Those cursors don't hold hazard + * pointers, nor are they counted in the session handle's cursor count. + * Skip the usual cursor tear-down in that case. + */ + if (!lowlevel) + ret = __curfile_leave(cbt); + __wt_buf_free(session, &cbt->_row_key); __wt_buf_free(session, &cbt->_tmp); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 77d80cdb3a2..38ef407e160 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -340,6 +340,8 @@ __wt_debug_disk( __dmsg(ds, ", empty-all"); if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) __dmsg(ds, ", empty-none"); + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE)) + __dmsg(ds, ", LAS-update"); __dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen); @@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", disk-mapped"); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) __dmsg(ds, ", evict-lru"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING)) - __dmsg(ds, ", scanning"); + if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)) + __dmsg(ds, ", reconciliation"); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); - if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)) - __dmsg(ds, ", split-locked"); if (mod != NULL) switch (F_ISSET(mod, WT_PM_REC_MASK)) { diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 8cca6328f21..c3c7afa1450 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -70,15 +70,15 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) /* If we have a clean page in memory, attempt to evict it. 
*/ if (ref->state == WT_REF_MEM && - WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED)) { + __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } - (void)WT_ATOMIC_ADD4(S2BT(session)->evict_busy, 1); - ret = __wt_evict_page(session, ref); - (void)WT_ATOMIC_SUB4(S2BT(session)->evict_busy, 1); + (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); + ret = __wt_evict(session, ref, 0); + (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } @@ -93,7 +93,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp) * unclear optimizing for overlapping range deletes is worth the effort. */ if (ref->state != WT_REF_DISK || - !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED)) + !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* @@ -176,8 +176,8 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * If the page is still "deleted", it's as we left it, * reset the state. */ - if (WT_ATOMIC_CAS4( - ref->state, WT_REF_DELETED, WT_REF_DISK)) + if (__wt_atomic_casv32( + &ref->state, WT_REF_DELETED, WT_REF_DISK)) return; break; case WT_REF_LOCKED: @@ -216,10 +216,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * __wt_delete_page_skip -- * If iterating a cursor, skip deleted pages that are visible to us. */ -int +bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) { - int skip; + bool skip; /* * Deleted pages come from two sources: either it's a fast-delete as @@ -240,13 +240,13 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) * the structure, just to be safe. */ if (ref->page_del == NULL) - return (1); + return (true); - if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) - return (0); + if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + return (false); - skip = (ref->page_del == NULL || - __wt_txn_visible(session, ref->page_del->txnid)); + skip = ref->page_del == NULL || + __wt_txn_visible(session, ref->page_del->txnid); WT_PUBLISH(ref->state, WT_REF_DELETED); return (skip); diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index a05c6217338..73e6affccd3 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t); static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *); static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t); -static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *); /* * __wt_ref_out -- @@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED)); + WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)); #ifdef HAVE_DIAGNOSTIC { @@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); __wt_free(session, multi->addr.addr); } __wt_free(session, mod->mod_multi); @@ -235,10 +234,7 @@ __wt_free_ref( * it clean explicitly.) 
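Fast-delete moves a WT_REF through a small state machine using compare-and-swap: only the thread whose CAS wins the DISK-to-LOCKED (or DELETED-to-LOCKED) transition owns the ref; a loser sees the changed state and backs off. A reduced C11 model of the transition:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum ref_state { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

/*
 * Try to move the state from "expected" to REF_LOCKED; on failure some
 * other thread changed the state first and this thread must not touch
 * the ref.
 */
static bool
ref_lock_from(_Atomic uint32_t *statep, uint32_t expected)
{
	return (atomic_compare_exchange_strong(
	    statep, &expected, (uint32_t)REF_LOCKED));
}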
*/ if (free_pages && ref->page != NULL) { - if (ref->page->modify != NULL) { - ref->page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, ref->page); - } + __wt_page_modify_clear(session, ref->page); __wt_page_out(session, &ref->page); } @@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins) WT_INSERT *next; for (; ins != NULL; ins = next) { - __free_update_list(session, ins->upd); + __wt_free_update_list(session, ins->upd); next = WT_SKIP_NEXT(ins); __wt_free(session, ins); } @@ -395,29 +391,23 @@ __free_update( */ for (updp = update_head; entries > 0; --entries, ++updp) if (*updp != NULL) - __free_update_list(session, *updp); + __wt_free_update_list(session, *updp); /* Free the update array. */ __wt_free(session, update_head); } /* - * __free_update_list -- + * __wt_free_update_list -- * Walk a WT_UPDATE forward-linked list and free the per-thread combination * of a WT_UPDATE structure and its associated data. */ -static void -__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) +void +__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_UPDATE *next; for (; upd != NULL; upd = next) { - /* Everything we free should be visible to everyone. */ - WT_ASSERT(session, - F_ISSET(session, WT_SESSION_DISCARD_FORCE) || - upd->txnid == WT_TXN_ABORTED || - __wt_txn_visible_all(session, upd->txnid)); - next = upd->next; __wt_free(session, upd); } diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c index c1a8ab61054..6a4243a0fc7 100644 --- a/src/btree/bt_handle.c +++ b/src/btree/bt_handle.c @@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) /* Page sizes */ WT_RET(__btree_page_sizes(session)); - /* - * Set special flags for the metadata file. - * Eviction; the metadata file is never evicted. - * Logging; the metadata file is always logged if possible. - */ - if (WT_IS_METADATA(btree->dhandle)) { + WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); + if (cval.val) F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + else + F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); + + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + if (cval.val) F_CLR(btree, WT_BTREE_NO_LOGGING); - } else { - WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval)); - if (cval.val) - F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - else - F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION); - - WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); - if (cval.val) - F_CLR(btree, WT_BTREE_NO_LOGGING); - else - F_SET(btree, WT_BTREE_NO_LOGGING); - } + else + F_SET(btree, WT_BTREE_NO_LOGGING); /* Checksums */ WT_RET(__wt_config_gets(session, cfg, "checksum", &cval)); @@ -352,8 +342,6 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt) session, &btree->ovfl_lock, "btree overflow lock")); WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock")); - __wt_stat_init_dsrc_stats(&btree->dhandle->stats); - btree->write_gen = ckpt->write_gen; /* Write generation */ btree->modified = 0; /* Clean */ @@ -372,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno) root_ref->page = root; root_ref->state = WT_REF_MEM; - root_ref->key.recno = is_recno ? 1 : 0; + root_ref->key.recno = is_recno ? 
1 : WT_RECNO_OOB; root->pg_intl_parent_ref = root_ref; } @@ -385,12 +373,15 @@ int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) { + WT_BM *bm; WT_BTREE *btree; + WT_DECL_ITEM(tmp); WT_DECL_RET; WT_ITEM dsk; WT_PAGE *page; btree = S2BT(session); + bm = btree->bm; /* * A buffer into which we read a root page; don't use a scratch buffer, @@ -399,12 +390,43 @@ __wt_btree_tree_open( WT_CLEAR(dsk); /* - * Read the page, then build the in-memory version of the page. Clear - * any local reference to an allocated copy of the disk image on return, - * the page steals it. + * Read and verify the page (verify to catch encrypted objects we can't + * decrypt, where we read the object successfully but we can't decrypt + * it, and we want to fail gracefully). + * + * Create a printable version of the address to pass to verify. + */ + WT_ERR(__wt_scr_alloc(session, 0, &tmp)); + WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); + + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); + if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0) + ret = __wt_verify_dsk(session, tmp->data, &dsk); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); + if (ret != 0) + __wt_err(session, ret, + "unable to read root page from %s", session->dhandle->name); + /* + * Failure to open metadata means that the database is unavailable. + * Try to provide a helpful failure message. + */ + if (ret != 0 && WT_IS_METADATA(session->dhandle)) { + __wt_errx(session, + "WiredTiger has failed to open its metadata"); + __wt_errx(session, "This may be due to the database" + " files being encrypted, being from an older" + " version or due to corruption on disk"); + __wt_errx(session, "You should confirm that you have" + " opened the database with the correct options including" + " all encryption and compression options"); + } + WT_ERR(ret); + + /* + * Build the in-memory version of the page. Clear our local reference to + * the allocated copy of the disk image on return, the in-memory object + * steals it. */ - WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size)); - WT_ERR(__wt_verify_dsk(session, (const char *)addr, &dsk)); WT_ERR(__wt_page_inmem(session, NULL, dsk.data, dsk.memsize, WT_DATA_IN_ITEM(&dsk) ? 
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); @@ -414,6 +436,8 @@ __wt_btree_tree_open( __wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW); err: __wt_buf_free(session, &dsk); + __wt_scr_free(session, &tmp); + return (ret); } @@ -663,9 +687,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session) WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval)); btree->maxmempage = WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage); - cache_size = S2C(session)->cache_size; - if (cache_size > 0) - btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4); + if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) { + if ((cache_size = S2C(session)->cache_size) > 0) + btree->maxmempage = + WT_MIN(btree->maxmempage, cache_size / 4); + } /* * Get the split percentage (reconciliation splits pages into smaller diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c index a8bbf8a0266..836c1540c5f 100644 --- a/src/btree/bt_io.c +++ b/src/btree/bt_io.c @@ -24,10 +24,12 @@ __wt_bt_read(WT_SESSION_IMPL *session, WT_ENCRYPTOR *encryptor; WT_ITEM *ip; const WT_PAGE_HEADER *dsk; + const char *fail_msg; size_t result_len; btree = S2BT(session); bm = btree->bm; + fail_msg = NULL; /* -Wuninitialized */ /* * If anticipating a compressed or encrypted block, read into a scratch @@ -52,40 +54,36 @@ __wt_bt_read(WT_SESSION_IMPL *session, if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) { if (btree->kencryptor == NULL || (encryptor = btree->kencryptor->encryptor) == NULL || - encryptor->decrypt == NULL) - WT_ERR_MSG(session, WT_ERROR, - "read encrypted block where no decryption engine " - "configured"); + encryptor->decrypt == NULL) { + fail_msg = + "encrypted block in file for which no encryption " + "configured"; + goto corrupt; + } WT_ERR(__wt_scr_alloc(session, 0, &etmp)); - ret = __wt_decrypt(session, - encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp); - /* - * It may be file corruption, which is really, really bad, or - * may be a mismatch of encryption configuration, for example, - * an incorrect secretkey. - */ - if (ret != 0) - WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) || - F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? - WT_ERROR : - __wt_illegal_value(session, btree->dhandle->name)); + if ((ret = __wt_decrypt(session, + encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) { + fail_msg = "block decryption failed"; + goto corrupt; + } ip = etmp; dsk = ip->data; - } else if (btree->kencryptor != NULL && - !F_ISSET(btree, WT_BTREE_VERIFY) && - !F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK)) - WT_ERR_MSG(session, WT_ERROR, - "encryption configured, and existing file is not " - "encrypted"); + } else if (btree->kencryptor != NULL) { + fail_msg = + "unencrypted block in file for which encryption configured"; + goto corrupt; + } if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || - btree->compressor->decompress == NULL) - WT_ERR_MSG(session, WT_ERROR, - "read compressed block where no compression engine " - "configured"); + btree->compressor->decompress == NULL) { + fail_msg = + "compressed block in file for which no compression " + "configured"; + goto corrupt; + } /* * Size the buffer based on the in-memory bytes we're expecting @@ -118,11 +116,10 @@ __wt_bt_read(WT_SESSION_IMPL *session, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || - result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) - WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) || - F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? 
- WT_ERROR : - __wt_illegal_value(session, btree->dhandle->name)); + result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) { + fail_msg = "block decryption failed"; + goto corrupt; + } } else /* * If we uncompressed above, the page is in the correct buffer. @@ -139,7 +136,7 @@ __wt_bt_read(WT_SESSION_IMPL *session, if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); - WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); + WT_ERR(__wt_verify_dsk(session, tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); @@ -149,6 +146,16 @@ __wt_bt_read(WT_SESSION_IMPL *session, WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); + if (0) { +corrupt: if (ret == 0) + ret = WT_ERROR; + if (!F_ISSET(btree, WT_BTREE_VERIFY) && + !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) { + __wt_err(session, ret, "%s", fail_msg); + ret = __wt_illegal_value(session, btree->dhandle->name); + } + } + err: __wt_scr_free(session, &tmp); __wt_scr_free(session, &etmp); return (ret); diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c index d8456c5b61f..7104e702418 100644 --- a/src/btree/bt_ovfl.c +++ b/src/btree/bt_ovfl.c @@ -79,7 +79,7 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, * __ovfl_cache_col_visible -- * column-store: check for a globally visible update. */ -static int +static bool __ovfl_cache_col_visible( WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack) { @@ -99,15 +99,15 @@ __ovfl_cache_col_visible( if (__wt_cell_rle(unpack) == 1 && upd != NULL && /* Sanity: upd should always be set. */ __wt_txn_visible_all(session, upd->txnid)) - return (1); - return (0); + return (true); + return (false); } /* * __ovfl_cache_row_visible -- * row-store: check for a globally visible update. */ -static int +static bool __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) { WT_UPDATE *upd; @@ -115,9 +115,9 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip) /* Check to see if there's a globally visible update. */ for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next) if (__wt_txn_visible_all(session, upd->txnid)) - return (1); + return (true); - return (0); + return (false); } /* diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 86edd992b28..ba218fc332c 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -17,214 +17,6 @@ static int __inmem_row_leaf_entries( WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *); /* - * __evict_force_check -- - * Check if a page matches the criteria for forced eviction. - */ -static int -__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_BTREE *btree; - - btree = S2BT(session); - - /* Pages are usually small enough, check that first. */ - if (page->memory_footprint < btree->maxmempage) - return (0); - - /* Leaf pages only. */ - if (WT_PAGE_IS_INTERNAL(page)) - return (0); - - /* - * It's hard to imagine a page with a huge memory footprint that has - * never been modified, but check to be sure. - */ - if (page->modify == NULL) - return (0); - - /* Trigger eviction on the next page release. */ - __wt_page_evict_soon(page); - - /* Bump the oldest ID, we're about to do some visibility checks. */ - __wt_txn_update_oldest(session, 0); - - /* If eviction cannot succeed, don't try. 
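The new failure handling in __wt_bt_read collects every decode error into one tail using the classic if (0) { label: ... } construction: the block is unreachable on the straight-line path and only entered by goto, so a single place decides whether corruption is reported quietly (verify, salvage) or promoted to __wt_illegal_value. (One nit: the decompression-length check above reuses the string "block decryption failed"; from context it plainly means decompression.) The idiom in isolation, with hypothetical helpers:

#include <errno.h>

extern int read_block(void);			/* Hypothetical. */
extern int decode_block(void);			/* Hypothetical. */
extern void report(const char *msg, int err);	/* Hypothetical. */

static int
read_and_decode(void)
{
	const char *fail_msg = NULL;
	int ret;

	if ((ret = read_block()) != 0) {
		fail_msg = "block read failed";
		goto corrupt;
	}
	if ((ret = decode_block()) != 0) {
		fail_msg = "block decode failed";
		goto corrupt;
	}
	return (0);

	/* Unreachable except by goto; one cleanup tail, many callers. */
	if (0) {
corrupt:	if (ret == 0)
			ret = EIO;
		report(fail_msg, ret);
	}
	return (ret);
}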
*/ - return (__wt_page_can_evict(session, page, 1, NULL)); -} - -/* - * __wt_page_in_func -- - * Acquire a hazard pointer to a page; if the page is not in-memory, - * read it from the disk and build an in-memory version. - */ -int -__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags -#ifdef HAVE_DIAGNOSTIC - , const char *file, int line -#endif - ) -{ - WT_BTREE *btree; - WT_DECL_RET; - WT_PAGE *page; - u_int sleep_cnt, wait_cnt; - int busy, cache_work, force_attempts, oldgen; - - btree = S2BT(session); - - for (force_attempts = oldgen = 0, wait_cnt = 0;;) { - switch (ref->state) { - case WT_REF_DISK: - case WT_REF_DELETED: - if (LF_ISSET(WT_READ_CACHE)) - return (WT_NOTFOUND); - - /* - * The page isn't in memory, attempt to read it. - * Make sure there is space in the cache. - */ - WT_RET(__wt_cache_eviction_check(session, 1, NULL)); - WT_RET(__wt_cache_read(session, ref)); - oldgen = LF_ISSET(WT_READ_WONT_NEED) || - F_ISSET(session, WT_SESSION_NO_CACHE); - continue; - case WT_REF_READING: - if (LF_ISSET(WT_READ_CACHE)) - return (WT_NOTFOUND); - if (LF_ISSET(WT_READ_NO_WAIT)) - return (WT_NOTFOUND); - - /* Waiting on another thread's read, stall. */ - WT_STAT_FAST_CONN_INCR(session, page_read_blocked); - goto stall; - case WT_REF_LOCKED: - if (LF_ISSET(WT_READ_NO_WAIT)) - return (WT_NOTFOUND); - - /* Waiting on eviction, stall. */ - WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); - goto stall; - case WT_REF_SPLIT: - return (WT_RESTART); - case WT_REF_MEM: - /* - * The page is in memory. - * - * Get a hazard pointer if one is required. We cannot - * be evicting if no hazard pointer is required, we're - * done. - */ - if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) - goto skip_evict; - - /* - * The expected reason we can't get a hazard pointer is - * because the page is being evicted, yield, try again. - */ -#ifdef HAVE_DIAGNOSTIC - WT_RET( - __wt_hazard_set(session, ref, &busy, file, line)); -#else - WT_RET(__wt_hazard_set(session, ref, &busy)); -#endif - if (busy) { - WT_STAT_FAST_CONN_INCR( - session, page_busy_blocked); - break; - } - - /* - * If eviction is configured for this file, check to see - * if the page qualifies for forced eviction and update - * the page's generation number. If eviction isn't being - * done on this file, we're done. - */ - if (LF_ISSET(WT_READ_NO_EVICT) || - F_ISSET(btree, WT_BTREE_NO_EVICTION)) - goto skip_evict; - - /* - * Forcibly evict pages that are too big. - */ - page = ref->page; - if (force_attempts < 10 && - __evict_force_check(session, page)) { - ++force_attempts; - ret = __wt_page_release_evict(session, ref); - /* If forced eviction fails, stall. */ - if (ret == EBUSY) { - ret = 0; - WT_STAT_FAST_CONN_INCR(session, - page_forcible_evict_blocked); - goto stall; - } - WT_RET(ret); - - /* - * The result of a successful forced eviction - * is a page-state transition (potentially to - * an in-memory page we can use, or a restart - * return for our caller), continue the outer - * page-acquisition loop. - */ - continue; - } - - /* - * If we read the page and we are configured to not - * trash the cache, set the oldest read generation so - * the page is forcibly evicted as soon as possible. - * - * Otherwise, update the page's read generation. 
- */ - if (oldgen && page->read_gen == WT_READGEN_NOTSET) - __wt_page_evict_soon(page); - else if (!LF_ISSET(WT_READ_NO_GEN) && - page->read_gen != WT_READGEN_OLDEST && - page->read_gen < __wt_cache_read_gen(session)) - page->read_gen = - __wt_cache_read_gen_bump(session); -skip_evict: - /* - * Check if we need an autocommit transaction. - * Starting a transaction can trigger eviction, so skip - * it if eviction isn't permitted. - */ - return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : - __wt_txn_autocommit_check(session)); - WT_ILLEGAL_VALUE(session); - } - - /* - * We failed to get the page -- yield before retrying, and if - * we've yielded enough times, start sleeping so we don't burn - * CPU to no purpose. - */ - if (++wait_cnt < 1000) - __wt_yield(); - else { - if (0) { -stall: wait_cnt += 1000; - } - - /* - * If stalling, check if the cache needs help. If we do - * work for the cache, substitute that for a sleep. - */ - WT_RET( - __wt_cache_eviction_check(session, 1, &cache_work)); - if (!cache_work) { - sleep_cnt = WT_MIN(wait_cnt, 10000); - wait_cnt *= 2; - WT_STAT_FAST_CONN_INCRV( - session, page_sleep, sleep_cnt); - __wt_sleep(0, sleep_cnt); - } - } - } -} - -/* * __wt_page_alloc -- * Create or read a page into the cache. */ @@ -326,8 +118,8 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { /* Increment the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); - (void)WT_ATOMIC_ADD8(cache->bytes_read, size); - (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1); + (void)__wt_atomic_add64(&cache->bytes_read, size); + (void)__wt_atomic_add64(&cache->pages_inmem, 1); *pagep = page; return (0); diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c index e27f7c3398c..d26b44e04c0 100644 --- a/src/btree/bt_read.c +++ b/src/btree/bt_read.c @@ -9,19 +9,328 @@ #include "wt_internal.h" /* - * __wt_cache_read -- - * Read a page from the file. + * __wt_las_remove_block -- + * Remove all records matching a key prefix from the lookaside store. */ int -__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_las_remove_block(WT_SESSION_IMPL *session, + WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) { + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_RET; + uint64_t las_counter, las_txnid; + uint32_t las_id; + int exact; + + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + + /* + * Search for the block's unique prefix and step through all matching + * records, removing them. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != btree_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * Cursor opened overwrite=true: won't return WT_NOTFOUND should + * another thread remove the record before we do, and the cursor + * remains positioned in that case. 
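__wt_las_remove_block's scan is the general WT_CURSOR prefix-iteration recipe: set the smallest key that could carry the prefix, position with search_near, step forward once if it landed before the range, then next() until the decoded key no longer matches. The same recipe over an ordinary string-keyed table, as a sketch:

#include <string.h>
#include <wiredtiger.h>

/* Remove every record whose key starts with "prefix"; a sketch. */
static int
remove_prefix(WT_CURSOR *cursor, const char *prefix)
{
	const char *key;
	size_t len;
	int exact, ret;

	len = strlen(prefix);
	cursor->set_key(cursor, prefix);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		if ((ret = cursor->get_key(cursor, &key)) != 0)
			break;
		if (strncmp(key, prefix, len) != 0)
			break;		/* Stepped past the prefix range. */
		/*
		 * Opened with overwrite=true, remove won't fail if another
		 * thread deleted the record first (the diff's comment).
		 */
		if ((ret = cursor->remove(cursor)) != 0)
			break;
	}
	return (ret == WT_NOTFOUND ? 0 : ret);
}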
+ */ + WT_ERR(cursor->remove(cursor)); + } + WT_ERR_NOTFOUND_OK(ret); + +err: __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + return (ret); +} + +/* + * __col_instantiate -- + * Update a column-store page entry based on a lookaside table update list. + */ +static int +__col_instantiate(WT_SESSION_IMPL *session, + uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_col_search(session, recno, ref, cbt)); + WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0)); + return (0); +} + +/* + * __row_instantiate -- + * Update a row-store page entry based on a lookaside table update list. + */ +static int +__row_instantiate(WT_SESSION_IMPL *session, + WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + /* Search the page and add updates. */ + WT_RET(__wt_row_search(session, key, ref, cbt, 1)); + WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0)); + return (0); +} + +/* + * __las_page_instantiate -- + * Instantiate lookaside update records in a recently read page. + */ +static int +__las_page_instantiate(WT_SESSION_IMPL *session, + WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) +{ + WT_CURSOR *cursor; + WT_CURSOR_BTREE cbt; + WT_DECL_ITEM(current_key); + WT_DECL_ITEM(las_addr); + WT_DECL_ITEM(las_key); + WT_DECL_ITEM(las_value); + WT_DECL_RET; + WT_PAGE *page; + WT_UPDATE *first_upd, *last_upd, *upd; + size_t incr, total_incr; + uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; + uint32_t las_id, upd_size, session_flags; + int exact; + const uint8_t *p; + + cursor = NULL; + page = ref->page; + first_upd = last_upd = upd = NULL; + total_incr = 0; + current_recno = recno = WT_RECNO_OOB; + session_flags = 0; /* [-Werror=maybe-uninitialized] */ + + __wt_btcur_init(session, &cbt); + __wt_btcur_open(&cbt); + + WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); + WT_ERR(__wt_scr_alloc(session, 0, &las_key)); + WT_ERR(__wt_scr_alloc(session, 0, &las_value)); + + /* Open a lookaside table cursor. */ + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* + * The lookaside records are in key and update order, that is, there + * will be a set of in-order updates for a key, then another set of + * in-order updates for a subsequent key. We process all of the updates + * for a key and then insert those updates into the page, then all the + * updates for the next key, and so on. + * + * Search for the block's unique prefix, stepping through any matching + * records. + */ + las_addr->data = addr; + las_addr->size = addr_size; + las_key->size = 0; + cursor->set_key( + cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); + if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) + ret = cursor->next(cursor); + for (; ret == 0; ret = cursor->next(cursor)) { + WT_ERR(cursor->get_key(cursor, + &las_id, las_addr, &las_counter, &las_txnid, las_key)); + + /* + * Confirm the search using the unique prefix; if not a match, + * we're done searching for records for this page. + */ + if (las_id != read_id || + las_addr->size != addr_size || + memcmp(las_addr->data, addr, addr_size) != 0) + break; + + /* + * If the on-page value has become globally visible, this record + * is no longer needed. + */ + if (__wt_txn_visible_all(session, las_txnid)) + continue; + + /* Allocate the WT_UPDATE structure. 
*/ + WT_ERR(cursor->get_value( + cursor, &upd_txnid, &upd_size, las_value)); + WT_ERR(__wt_update_alloc(session, + (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, + &upd, &incr)); + total_incr += incr; + upd->txnid = upd_txnid; + + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = las_key->data; + WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); + if (current_recno == recno) + break; + WT_ASSERT(session, current_recno < recno); + + if (first_upd != NULL) { + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + } + current_recno = recno; + break; + case WT_PAGE_ROW_LEAF: + if (current_key->size == las_key->size && + memcmp(current_key->data, + las_key->data, las_key->size) == 0) + break; + + if (first_upd != NULL) { + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + } + WT_ERR(__wt_buf_set(session, + current_key, las_key->data, las_key->size)); + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Append the latest update to the list. */ + if (first_upd == NULL) + first_upd = last_upd = upd; + else { + last_upd->next = upd; + last_upd = upd; + } + upd = NULL; + } + WT_ERR_NOTFOUND_OK(ret); + + /* Insert the last set of updates, if any. */ + if (first_upd != NULL) + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + WT_ERR(__col_instantiate(session, + current_recno, ref, &cbt, first_upd)); + first_upd = NULL; + break; + case WT_PAGE_ROW_LEAF: + WT_ERR(__row_instantiate(session, + current_key, ref, &cbt, first_upd)); + first_upd = NULL; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Discard the cursor. */ + WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); + + if (total_incr != 0) { + __wt_cache_page_inmem_incr(session, page, total_incr); + + /* + * We've modified/dirtied the page, but that's not necessary and + * if we keep the page clean, it's easier to evict. We leave the + * lookaside table updates in place, so if we evict this page + * without dirtying it, any future instantiation of it will find + * the records it needs. If the page is dirtied before eviction, + * then we'll write any needed lookaside table records for the + * new location of the page. + */ + __wt_page_modify_clear(session, page); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + WT_TRET(__wt_btcur_close(&cbt, 1)); + + /* + * On error, upd points to a single unlinked WT_UPDATE structure, + * first_upd points to a list. + */ + if (upd != NULL) + __wt_free(session, upd); + if (first_upd != NULL) + __wt_free_update_list(session, first_upd); + + __wt_scr_free(session, ¤t_key); + __wt_scr_free(session, &las_addr); + __wt_scr_free(session, &las_key); + __wt_scr_free(session, &las_value); + + return (ret); +} + +/* + * __evict_force_check -- + * Check if a page matches the criteria for forced eviction. + */ +static int +__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_BTREE *btree; + + btree = S2BT(session); + + /* Pages are usually small enough, check that first. */ + if (page->memory_footprint < btree->maxmempage) + return (0); + + /* Leaf pages only. */ + if (WT_PAGE_IS_INTERNAL(page)) + return (0); + + /* + * It's hard to imagine a page with a huge memory footprint that has + * never been modified, but check to be sure. + */ + if (page->modify == NULL) + return (0); + + /* Trigger eviction on the next page release. */ + __wt_page_evict_soon(page); + + /* Bump the oldest ID, we're about to do some visibility checks. 
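Within the instantiate loop, updates for a single key are collected into a forward-linked chain through first_upd/last_upd and only handed to __col_instantiate/__row_instantiate once the key changes or the scan ends, so each page search is paid once per key rather than once per update. The head/tail append idiom, isolated:

struct upd {
	struct upd *next;
	/* ...payload... */
};

/* Append in arrival order; the newest record lands at the tail. */
static void
chain_append(struct upd **headp, struct upd **tailp, struct upd *upd)
{
	upd->next = NULL;
	if (*headp == NULL)
		*headp = *tailp = upd;
	else {
		(*tailp)->next = upd;
		*tailp = upd;
	}
}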
*/ + __wt_txn_update_oldest(session, 0); + + /* If eviction cannot succeed, don't try. */ + return (__wt_page_can_evict(session, page, 1, NULL)); +} + +/* + * __page_read -- + * Read a page from the file. + */ +static int +__page_read(WT_SESSION_IMPL *session, WT_REF *ref) +{ + const WT_PAGE_HEADER *dsk; + WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; WT_PAGE *page; - WT_PAGE_STATE previous_state; size_t addr_size; + uint32_t previous_state; const uint8_t *addr; + btree = S2BT(session); page = NULL; /* @@ -35,9 +344,9 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) * WT_REF_LOCKED, for deleted pages. If successful, we've won the * race, read the page. */ - if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING)) + if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING)) previous_state = WT_REF_DISK; - else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) previous_state = WT_REF_DELETED; else return (0); @@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) /* * Get the address: if there is no address, the page was deleted, but a * subsequent search or insert is forcing re-creation of the name space. - * Otherwise, there's an address, read the backing disk page and build - * an in-memory version of the page. */ WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) { @@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref) WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; - } else { - /* - * Read the page, then build the in-memory version of the page. - * Clear any local reference to an allocated copy of the disk - * image on return, the page steals it. - */ - WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); - WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, - WT_DATA_IN_ITEM(&tmp) ? - WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); - tmp.mem = NULL; - - /* If the page was deleted, instantiate that information. */ - if (previous_state == WT_REF_DELETED) - WT_ERR(__wt_delete_page_instantiate(session, ref)); + goto done; } - WT_ERR(__wt_verbose(session, WT_VERB_READ, - "page %p: %s", page, __wt_page_type_string(page->type))); + /* + * There's an address, read or map the backing disk page and build an + * in-memory version of the page. + */ + WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size)); + WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize, + WT_DATA_IN_ITEM(&tmp) ? + WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page)); + + /* + * Clear the local reference to an allocated copy of the disk image on + * return; the page steals it, errors in this code should not free it. + */ + tmp.mem = NULL; - WT_PUBLISH(ref->state, WT_REF_MEM); + /* + * If reading for a checkpoint, there's no additional work to do, the + * page on disk is correct as written. + */ + if (session->dhandle->checkpoint != NULL) + goto done; + + /* If the page was deleted, instantiate that information. */ + if (previous_state == WT_REF_DELETED) + WT_ERR(__wt_delete_page_instantiate(session, ref)); + + /* + * Instantiate updates from the database's lookaside table. The page + * flag was set when the page was written, potentially a long time ago. + * We only care if the lookaside table is currently active, check that + * before doing any work. 
+ */ + dsk = tmp.data; + if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) { + WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside); + + WT_ERR(__las_page_instantiate( + session, ref, btree->id, addr, addr_size)); + } + +done: WT_PUBLISH(ref->state, WT_REF_MEM); return (0); err: /* @@ -90,3 +421,183 @@ err: /* return (ret); } + +/* + * __wt_page_in_func -- + * Acquire a hazard pointer to a page; if the page is not in-memory, + * read it from the disk and build an in-memory version. + */ +int +__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags +#ifdef HAVE_DIAGNOSTIC + , const char *file, int line +#endif + ) +{ + WT_BTREE *btree; + WT_DECL_RET; + WT_PAGE *page; + u_int sleep_cnt, wait_cnt; + int busy, cache_work, force_attempts, oldgen, stalled; + + btree = S2BT(session); + stalled = 0; + + for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { + switch (ref->state) { + case WT_REF_DISK: + case WT_REF_DELETED: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + + /* + * The page isn't in memory, read it. If this thread is + * allowed to do eviction work, check for space in the + * cache. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) + WT_RET(__wt_cache_eviction_check( + session, 1, NULL)); + WT_RET(__page_read(session, ref)); + oldgen = LF_ISSET(WT_READ_WONT_NEED) || + F_ISSET(session, WT_SESSION_NO_CACHE); + continue; + case WT_REF_READING: + if (LF_ISSET(WT_READ_CACHE)) + return (WT_NOTFOUND); + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on another thread's read, stall. */ + WT_STAT_FAST_CONN_INCR(session, page_read_blocked); + stalled = 1; + break; + case WT_REF_LOCKED: + if (LF_ISSET(WT_READ_NO_WAIT)) + return (WT_NOTFOUND); + + /* Waiting on eviction, stall. */ + WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); + stalled = 1; + break; + case WT_REF_SPLIT: + return (WT_RESTART); + case WT_REF_MEM: + /* + * The page is in memory. + * + * Get a hazard pointer if one is required. We cannot + * be evicting if no hazard pointer is required, we're + * done. + */ + if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) + goto skip_evict; + + /* + * The expected reason we can't get a hazard pointer is + * because the page is being evicted, yield, try again. + */ +#ifdef HAVE_DIAGNOSTIC + WT_RET( + __wt_hazard_set(session, ref, &busy, file, line)); +#else + WT_RET(__wt_hazard_set(session, ref, &busy)); +#endif + if (busy) { + WT_STAT_FAST_CONN_INCR( + session, page_busy_blocked); + break; + } + + /* + * If eviction is configured for this file, check to see + * if the page qualifies for forced eviction and update + * the page's generation number. If eviction isn't being + * done on this file, we're done. + */ + if (LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || + F_ISSET(btree, WT_BTREE_NO_EVICTION)) + goto skip_evict; + + /* + * Forcibly evict pages that are too big. + */ + page = ref->page; + if (force_attempts < 10 && + __evict_force_check(session, page)) { + ++force_attempts; + ret = __wt_page_release_evict(session, ref); + /* If forced eviction fails, stall. */ + if (ret == EBUSY) { + ret = 0; + WT_STAT_FAST_CONN_INCR(session, + page_forcible_evict_blocked); + stalled = 1; + break; + } + WT_RET(ret); + + /* + * The result of a successful forced eviction + * is a page-state transition (potentially to + * an in-memory page we can use, or a restart + * return for our caller), continue the outer + * page-acquisition loop. 
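+				 *
+				 * For orientation, the state machine this
+				 * outer loop walks, compressed (a summary of
+				 * the cases above, not new rules):
+				 *
+				 *	DISK, DELETED: read the page
+				 *	    (WT_NOTFOUND under WT_READ_CACHE)
+				 *	READING, LOCKED: stall and retry
+				 *	    (WT_NOTFOUND under WT_READ_NO_WAIT)
+				 *	SPLIT: return WT_RESTART
+				 *	MEM: take a hazard pointer, consider
+				 *	    forced eviction, return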
+ */ + continue; + } + + /* + * If we read the page and we are configured to not + * trash the cache, set the oldest read generation so + * the page is forcibly evicted as soon as possible. + * + * Otherwise, update the page's read generation. + */ + if (oldgen && page->read_gen == WT_READGEN_NOTSET) + __wt_page_evict_soon(page); + else if (!LF_ISSET(WT_READ_NO_GEN) && + page->read_gen != WT_READGEN_OLDEST && + page->read_gen < __wt_cache_read_gen(session)) + page->read_gen = + __wt_cache_read_gen_bump(session); +skip_evict: + /* + * Check if we need an autocommit transaction. + * Starting a transaction can trigger eviction, so skip + * it if eviction isn't permitted. + */ + return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : + __wt_txn_autocommit_check(session)); + WT_ILLEGAL_VALUE(session); + } + + /* + * We failed to get the page -- yield before retrying, and if + * we've yielded enough times, start sleeping so we don't burn + * CPU to no purpose. + */ + if (stalled) + wait_cnt += 1000; + else if (++wait_cnt < 1000) { + __wt_yield(); + continue; + } + + /* + * If stalling and this thread is allowed to do eviction work, + * check if the cache needs help. If we do work for the cache, + * substitute that for a sleep. + */ + if (!LF_ISSET(WT_READ_NO_EVICT)) { + WT_RET( + __wt_cache_eviction_check(session, 1, &cache_work)); + if (cache_work) + continue; + } + sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); + WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); + __wt_sleep(0, sleep_cnt); + } +} diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c index f41a5d86e9f..c2a211bdd2d 100644 --- a/src/btree/bt_slvg.c +++ b/src/btree/bt_slvg.c @@ -197,9 +197,9 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) * Turn off read checksum and verification error messages while we're * reading the file, we expect to see corrupted blocks. */ - F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK); + F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE); ret = __slvg_read(session, ss); - F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK); + F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE); WT_ERR(ret); /* @@ -349,9 +349,6 @@ err: WT_TRET(bm->salvage_end(bm, session)); __wt_scr_free(session, &ss->tmp1); __wt_scr_free(session, &ss->tmp2); - /* Wrap up reporting. */ - WT_TRET(__wt_progress(session, NULL, ss->fcnt)); - return (ret); } @@ -381,8 +378,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss) if (eof) break; - /* Report progress every 10 chunks. */ - if (++ss->fcnt % 10 == 0) + /* Report progress occasionally. */ +#define WT_SALVAGE_PROGRESS_INTERVAL 100 + if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0) WT_ERR(__wt_progress(session, NULL, ss->fcnt)); /* @@ -1305,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. */ page->pg_var_d = save_col_var; @@ -2011,7 +2009,7 @@ __slvg_row_build_leaf( /* Write the new version of the leaf page to disk. */ WT_ERR(__slvg_modify_init(session, page)); - WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR)); + WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR)); /* Reset the page. 
*/ page->pg_row_entries += skip_stop; diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index dbd4042129d..4b9ab45c678 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -45,10 +45,13 @@ static int __split_stash_add( WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len) { + WT_CONNECTION_IMPL *conn; WT_SPLIT_STASH *stash; WT_ASSERT(session, p != NULL); + conn = S2C(session); + /* Grow the list as necessary. */ WT_RET(__wt_realloc_def(session, &session->split_stash_alloc, session->split_stash_cnt + 1, &session->split_stash)); @@ -58,8 +61,8 @@ __split_stash_add( stash->p = p; stash->len = len; - WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len); - WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects); + (void)__wt_atomic_add64(&conn->split_stashed_bytes, len); + (void)__wt_atomic_add64(&conn->split_stashed_objects, 1); /* See if we can free any previous entries. */ if (session->split_stash_cnt > 1) @@ -75,10 +78,13 @@ __split_stash_add( void __wt_split_stash_discard(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_SPLIT_STASH *stash; uint64_t oldest; size_t i; + conn = S2C(session); + /* Get the oldest split generation. */ oldest = __split_oldest_gen(session); @@ -93,10 +99,8 @@ __wt_split_stash_discard(WT_SESSION_IMPL *session) * It's a bad thing if another thread is in this memory after * we free it, make sure nothing good happens to that thread. */ - WT_STAT_FAST_CONN_ATOMIC_DECRV( - session, rec_split_stashed_bytes, stash->len); - WT_STAT_FAST_CONN_ATOMIC_DECR( - session, rec_split_stashed_objects); + (void)__wt_atomic_sub64(&conn->split_stashed_bytes, stash->len); + (void)__wt_atomic_sub64(&conn->split_stashed_objects, 1); __wt_overwrite_and_free_len(session, stash->p, stash->len); } @@ -169,7 +173,7 @@ __split_safe_free(WT_SESSION_IMPL *session, * __split_should_deepen -- * Return if we should deepen the tree. */ -static int +static bool __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; @@ -192,7 +196,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) * pressure on the cache). */ if (page->memory_footprint < btree->maxmempage) - return (0); + return (false); /* * Ensure the page has enough entries to make it worth splitting and @@ -200,7 +204,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) * splitting won't help). */ if (pindex->entries > btree->split_deepen_min_child) - return (1); + return (true); /* * Don't allow a single page to put pressure on cache usage. The root @@ -212,9 +216,9 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) if (pindex->entries >= 100 && (__wt_ref_is_root(ref) || page->memory_footprint >= S2C(session)->cache_size / 4)) - return (1); + return (true); - return (0); + return (false); } /* @@ -339,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) switch (page->type) { case WT_PAGE_COL_INT: - recno = 0; + recno = 0; /* Less than any valid record number. 
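+			 *
+			 * (An aside on __split_should_deepen, above, now
+			 * returning bool: its decision reduces to the
+			 * following table, so only large pages with enough
+			 * children deepen the tree:
+			 *
+			 *	footprint < maxmempage           -> false
+			 *	entries > split_deepen_min_child -> true
+			 *	entries >= 100 && (root ||
+			 *	    footprint >= cache_size / 4) -> true
+			 *	otherwise                        -> false)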
*/ WT_INTL_FOREACH_BEGIN(session, page, ref) { WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; @@ -557,7 +561,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); panic = 1; #ifdef HAVE_DIAGNOSTIC @@ -680,13 +684,11 @@ __split_multi_inmem( WT_DECL_RET; WT_PAGE *page; WT_UPDATE *upd; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; uint64_t recno; uint32_t i, slot; - WT_CLEAR(cbt); - cbt.iface.session = &session->iface; - cbt.btree = S2BT(session); + __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); /* @@ -700,22 +702,22 @@ __split_multi_inmem( * allocated page on error, when discarding the allocated WT_REF. */ WT_RET(__wt_page_inmem(session, ref, - multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size, + multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size, WT_PAGE_DISK_ALLOC, &page)); - multi->skip_dsk = NULL; + multi->supd_dsk = NULL; if (orig->type == WT_PAGE_ROW_LEAF) WT_RET(__wt_scr_alloc(session, 0, &key)); /* Re-create each modification we couldn't write. */ - for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip) + for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) switch (orig->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: /* Build a key. */ - upd = skip->ins->upd; - skip->ins->upd = NULL; - recno = WT_INSERT_RECNO(skip->ins); + upd = supd->ins->upd; + supd->ins->upd = NULL; + recno = WT_INSERT_RECNO(supd->ins); /* Search the page. */ WT_ERR(__wt_col_search(session, recno, ref, &cbt)); @@ -726,19 +728,19 @@ __split_multi_inmem( break; case WT_PAGE_ROW_LEAF: /* Build a key. */ - if (skip->ins == NULL) { - slot = WT_ROW_SLOT(orig, skip->rip); + if (supd->ins == NULL) { + slot = WT_ROW_SLOT(orig, supd->rip); upd = orig->pg_row_upd[slot]; orig->pg_row_upd[slot] = NULL; WT_ERR(__wt_row_leaf_key( - session, orig, skip->rip, key, 0)); + session, orig, supd->rip, key, 0)); } else { - upd = skip->ins->upd; - skip->ins->upd = NULL; + upd = supd->ins->upd; + supd->ins->upd = NULL; - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } /* Search the page. */ @@ -761,7 +763,7 @@ __split_multi_inmem( page->modify->first_dirty_txn = WT_TXN_FIRST; err: /* Free any resources that may have been cached in the cursor. */ - WT_TRET(__wt_btcur_close(&cbt)); + WT_TRET(__wt_btcur_close(&cbt, 1)); __wt_scr_free(session, &key); return (ret); @@ -797,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, */ ref->home = NULL; - if (multi->skip == NULL) { + if (multi->supd == NULL) { /* * Copy the address: we could simply take the buffer, but that * would complicate error handling, freeing the reference array @@ -826,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, break; } - ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM; + ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM; /* * If our caller wants to track the memory allocations, we have a return @@ -837,16 +839,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, return (0); } -#define WT_SPLIT_EXCLUSIVE 0x01 /* Page held exclusively */ -#define WT_SPLIT_INMEM 0x02 /* In-memory split */ - /* * __split_parent -- * Resolve a multi-page split, inserting new information into the parent. 
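 *
 * As a sketch of the contract implemented below: the child's WT_REF is
 * replaced in the parent's index by ref_new[0..new_entries-1], the old
 * index is queued for free once every reader's split generation has
 * passed, and callers holding the tree exclusively (closing the file)
 * pass exclusive != 0 so the free can happen immediately.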
*/ static int __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, - WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags) + WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive) { WT_DECL_RET; WT_IKEY *ikey; @@ -874,26 +873,39 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * memory inside of the lock and may want to invest effort in making the * locked period shorter. * - * We could race with another thread deepening our parent. To deal - * with that, read the parent pointer each time we try to lock it, and - * check that it's still correct after it is locked. + * We use the reconciliation lock here because not only do we have to + * single-thread the split, we have to lock out reconciliation of the + * parent because reconciliation of the parent can't deal with finding + * a split child during internal page traversal. Basically, there's no + * reason to use a different lock if we have to block reconciliation + * anyway. */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret); + F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret); if (ret == 0) { + /* + * We can race with another thread deepening our parent. + * To deal with that, read the parent pointer each time + * we try to lock it, and check it's still correct after + * it's locked. + */ if (parent == ref->home) break; - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); continue; } + /* - * If we're attempting an in-memory split and we can't lock the - * parent, give up. This avoids an infinite loop where we are - * trying to split a page while its parent is being - * checkpointed. + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we can't lock + * the parent, give up to avoid that deadlock. */ - if (LF_ISSET(WT_SPLIT_INMEM)) + if (S2BT(session)->checkpointing) return (EBUSY); __wt_yield(); } @@ -905,9 +917,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * could conceivably be evicted. Get a hazard pointer on the parent * now, so that we can safely access it after updating the index. * - * Take care that getting the page doesn't trigger eviction, or we - * could block trying to split a different child of our parent and - * deadlock. + * Take care getting the page doesn't trigger eviction work: we could + * block trying to split a different child of our parent and deadlock + * or we could be the eviction server relied upon by other threads to + * populate the eviction queue. 
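+	 *
+	 * The deadlock reasoning above amounts to an ordering rule, roughly:
+	 *
+	 *	splitter:    child exclusive lock, then the parent's
+	 *	             WT_PAGE_RECONCILIATION flag
+	 *	checkpoint:  parent's WT_PAGE_RECONCILIATION flag, then
+	 *	             loops on the child's exclusive lock
+	 *
+	 * which is why the split path gives up with EBUSY while the file is
+	 * being checkpointed instead of waiting for the parent lock.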
*/ if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); @@ -933,8 +946,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); if (next_ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, next_ref) && - WT_ATOMIC_CAS4(next_ref->state, - WT_REF_DELETED, WT_REF_SPLIT)) + __wt_atomic_casv32( + &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) deleted_entries++; } @@ -994,7 +1007,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); WT_INTL_INDEX_SET(parent, alloc_index); - split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1); + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); alloc_index = NULL; #ifdef HAVE_DIAGNOSTIC @@ -1089,8 +1102,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Add it to the session discard list, to be freed when it's safe. */ size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *); - WT_TRET(__split_safe_free(session, - split_gen, LF_ISSET(WT_SPLIT_EXCLUSIVE) ? 1 : 0, pindex, size)); + WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); parent_decr += size; /* @@ -1115,7 +1127,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * Do the check here because we've just grown the parent page and * are holding it locked. */ - if (ret == 0 && !LF_ISSET(WT_SPLIT_EXCLUSIVE) && + if (ret == 0 && !exclusive && __split_should_deepen(session, parent_ref)) ret = __split_deepen(session, parent); @@ -1125,7 +1137,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED); + F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); @@ -1164,7 +1176,13 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) right = NULL; page_decr = parent_incr = right_incr = 0; + /* + * Assert splitting makes sense; specifically assert the page is dirty, + * we depend on that, otherwise the page might be evicted based on its + * last reconciliation which no longer matches reality after the split. + */ WT_ASSERT(session, __wt_page_can_split(session, page)); + WT_ASSERT(session, __wt_page_is_modified(page)); /* Find the last item on the page. */ ins_head = page->pg_row_entries == 0 ? @@ -1192,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The key-instantiation code checks for races, clear the key fields so * we don't trigger them. */ - child->key.recno = 0; + child->key.recno = WT_RECNO_OOB; child->key.ikey = NULL; child->state = WT_REF_MEM; @@ -1367,7 +1385,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, WT_SPLIT_INMEM)) != 0) { + session, ref, split_ref, 2, parent_incr, 0)) != 0) { /* * Move the insert list element back to the original page list. * For simplicity, the previous skip list pointers originally @@ -1384,8 +1402,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * We marked the new page dirty; we're going to discard it, but * first mark it clean and fix up the cache statistics. 
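	 *
	 * (An aside on the generation scheme used by __split_parent above:
	 * the old page index can't be freed in place because concurrent
	 * readers may still be walking it, so it's stashed under the current
	 * split generation and freed only after every thread's generation
	 * has moved past it, in outline:
	 *
	 *	split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
	 *	WT_TRET(__split_safe_free(
	 *	    session, split_gen, exclusive, pindex, size));)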
*/ - right->modify->write_gen = 0; - __wt_cache_dirty_decr(session, right); + __wt_page_modify_clear(session, right); WT_ERR(ret); } @@ -1442,8 +1459,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. */ - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); + __wt_page_modify_clear(session, page); __wt_ref_out(session, ref); /* Swap the new page into place. */ @@ -1486,8 +1502,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * Split into the parent; if we're closing the file, we hold it * exclusively. */ - WT_ERR(__split_parent( session, ref, ref_new, - new_entries, parent_incr, closing ? WT_SPLIT_EXCLUSIVE : 0)); + WT_ERR(__split_parent( + session, ref, ref_new, new_entries, parent_incr, closing)); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split); @@ -1500,10 +1516,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * Pages with unresolved changes are not marked clean during * reconciliation, do it now. */ - if (__wt_page_is_modified(page)) { - mod->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + __wt_page_modify_clear(session, page); __wt_page_out(session, &page); return (0); diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c index 6285edde217..b379712f6e7 100644 --- a/src/btree/bt_stat.c +++ b/src/btree/bt_stat.c @@ -8,10 +8,11 @@ #include "wt_internal.h" -static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); -static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *); -static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); -static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *); +static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); +static void __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); +static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); +static void + __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **); /* * __wt_btree_stat_init -- @@ -23,22 +24,22 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; - WT_DSRC_STATS *stats; + WT_DSRC_STATS **stats; WT_REF *next_walk; btree = S2BT(session); bm = btree->bm; - stats = &btree->dhandle->stats; + stats = btree->dhandle->stats; - WT_RET(bm->stat(bm, session, stats)); + WT_RET(bm->stat(bm, session, stats[0])); - WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt); - WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth); - WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage); - WT_STAT_SET(stats, btree_maxintlkey, btree->maxintlkey); - WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage); - WT_STAT_SET(stats, btree_maxleafkey, btree->maxleafkey); - WT_STAT_SET(stats, btree_maxleafvalue, btree->maxleafvalue); + WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); + WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); + WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); + WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); + WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); + WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); + WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. 
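	 *
	 * That is, the tree walk below visits every in-memory page, so it
	 * only runs when statistics were configured as "all" (the standard
	 * "statistics=(all)" connection or cursor configuration; the exact
	 * string here is illustrative, not introduced by this change).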
*/ if (!F_ISSET(cst, WT_CONN_STAT_ALL)) @@ -47,14 +48,15 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) /* * Clear the statistics we're about to count. */ - WT_STAT_SET(stats, btree_column_deleted, 0); - WT_STAT_SET(stats, btree_column_fix, 0); - WT_STAT_SET(stats, btree_column_internal, 0); - WT_STAT_SET(stats, btree_column_variable, 0); - WT_STAT_SET(stats, btree_entries, 0); - WT_STAT_SET(stats, btree_overflow, 0); - WT_STAT_SET(stats, btree_row_internal, 0); - WT_STAT_SET(stats, btree_row_leaf, 0); + WT_STAT_SET(session, stats, btree_column_deleted, 0); + WT_STAT_SET(session, stats, btree_column_fix, 0); + WT_STAT_SET(session, stats, btree_column_internal, 0); + WT_STAT_SET(session, stats, btree_column_rle, 0); + WT_STAT_SET(session, stats, btree_column_variable, 0); + WT_STAT_SET(session, stats, btree_entries, 0); + WT_STAT_SET(session, stats, btree_overflow, 0); + WT_STAT_SET(session, stats, btree_row_internal, 0); + WT_STAT_SET(session, stats, btree_row_leaf, 0); next_walk = NULL; while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 && @@ -71,7 +73,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) * Stat any Btree page. */ static int -__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) +__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { /* * All internal pages and overflow pages are trivial, all we track is @@ -79,14 +81,15 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) */ switch (page->type) { case WT_PAGE_COL_FIX: - WT_STAT_INCR(stats, btree_column_fix); - WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries); + WT_STAT_INCR(session, stats, btree_column_fix); + WT_STAT_INCRV( + session, stats, btree_entries, page->pg_fix_entries); break; case WT_PAGE_COL_INT: - WT_STAT_INCR(stats, btree_column_internal); + WT_STAT_INCR(session, stats, btree_column_internal); break; case WT_PAGE_COL_VAR: - __stat_page_col_var(page, stats); + __stat_page_col_var(session, page, stats); break; case WT_PAGE_ROW_INT: __stat_page_row_int(session, page, stats); @@ -104,21 +107,22 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) * Stat a WT_PAGE_COL_VAR page. 
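 *
 * Arithmetic note for the new btree_column_rle statistic below: a cell
 * with run-length N contributes N to btree_entries and N - 1 to
 * btree_column_rle, so btree_entries minus btree_column_rle recovers
 * the number of physical cells on the page.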
*/ static void -__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) +__stat_page_col_var( + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; WT_UPDATE *upd; - uint64_t deleted_cnt, entry_cnt, ovfl_cnt; + uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; uint32_t i; int orig_deleted; unpack = &_unpack; - deleted_cnt = entry_cnt = ovfl_cnt = 0; + deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0; - WT_STAT_INCR(stats, btree_column_variable); + WT_STAT_INCR(session, stats, btree_column_variable); /* * Walk the page counting regular items, adjusting if the item has been @@ -137,8 +141,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) __wt_cell_unpack(cell, unpack); if (unpack->type == WT_CELL_ADDR_DEL) orig_deleted = 1; - else + else { entry_cnt += __wt_cell_rle(unpack); + rle_cnt += __wt_cell_rle(unpack) - 1; + } if (unpack->ovfl) ++ovfl_cnt; } @@ -169,9 +175,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) else ++entry_cnt; - WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt); - WT_STAT_INCRV(stats, btree_entries, entry_cnt); - WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); + WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt); + WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt); + WT_STAT_INCRV(session, stats, btree_entries, entry_cnt); + WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt); } /* @@ -180,7 +187,7 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats) */ static void __stat_page_row_int( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_BTREE *btree; WT_CELL *cell; @@ -190,7 +197,7 @@ __stat_page_row_int( btree = S2BT(session); ovfl_cnt = 0; - WT_STAT_INCR(stats, btree_row_internal); + WT_STAT_INCR(session, stats, btree_row_internal); /* * Overflow keys are hard: we have to walk the disk image to count them, @@ -204,7 +211,7 @@ __stat_page_row_int( ++ovfl_cnt; } - WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); + WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt); } /* @@ -213,7 +220,7 @@ __stat_page_row_int( */ static void __stat_page_row_leaf( - WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats) + WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats) { WT_BTREE *btree; WT_CELL *cell; @@ -226,7 +233,7 @@ __stat_page_row_leaf( btree = S2BT(session); entry_cnt = ovfl_cnt = 0; - WT_STAT_INCR(stats, btree_row_leaf); + WT_STAT_INCR(session, stats, btree_row_leaf); /* * Walk any K/V pairs inserted into the page before the first from-disk @@ -267,6 +274,6 @@ __stat_page_row_leaf( ++ovfl_cnt; } - WT_STAT_INCRV(stats, btree_entries, entry_cnt); - WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt); + WT_STAT_INCRV(session, stats, btree_entries, entry_cnt); + WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt); } diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 838d778dadf..29ae5b185cd 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -259,7 +259,6 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op) break; case WT_SYNC_CLOSE: case WT_SYNC_DISCARD: - case WT_SYNC_DISCARD_FORCE: WT_ERR(__wt_evict_file(session, op)); break; WT_ILLEGAL_VALUE_ERR(session); diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c index 3f615babb07..1fd660d4cd4 100644 --- a/src/btree/bt_vrfy.c +++ b/src/btree/bt_vrfy.c @@ -245,9 +245,6 @@ err: /* Inform the underlying block manager we're done. 
*/ if (ckptbase != NULL) __wt_meta_ckptlist_free(session, ckptbase); - /* Wrap up reporting. */ - WT_TRET(__wt_progress(session, NULL, vs->fcnt)); - /* Free allocated memory. */ __wt_scr_free(session, &vs->max_key); __wt_scr_free(session, &vs->max_addr); @@ -343,9 +340,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) * of the page to be built, and then a subsequent logical verification * which happens here. * - * Report progress every 10 pages. + * Report progress occasionally. */ - if (++vs->fcnt % 10 == 0) +#define WT_VERIFY_PROGRESS_INTERVAL 100 + if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c index 904a16a7548..e80bde3c91e 100644 --- a/src/btree/bt_vrfy_dsk.c +++ b/src/btree/bt_vrfy_dsk.c @@ -26,13 +26,13 @@ static int __verify_dsk_row( WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *); #define WT_ERR_VRFY(session, ...) do { \ - if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \ + if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \ __wt_errx(session, __VA_ARGS__); \ goto err; \ } while (0) #define WT_RET_VRFY(session, ...) do { \ - if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \ + if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \ __wt_errx(session, __VA_ARGS__); \ return (WT_ERROR); \ } while (0) @@ -43,7 +43,7 @@ static int __verify_dsk_row( */ int __wt_verify_dsk_image(WT_SESSION_IMPL *session, - const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok) + const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok) { const uint8_t *p, *end; u_int i; @@ -63,7 +63,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, default: WT_RET_VRFY(session, "page at %s has an invalid type of %" PRIu32, - addr, dsk->type); + tag, dsk->type); } /* Check the page record number. */ @@ -71,51 +71,54 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: - if (dsk->recno != 0) + if (dsk->recno != WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a record number of zero", - __wt_page_type_string(dsk->type), addr); + "%s page at %s has an invalid record number of %d", + __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - if (dsk->recno == 0) + if (dsk->recno == WT_RECNO_OOB) break; WT_RET_VRFY(session, - "%s page at %s has a non-zero record number", - __wt_page_type_string(dsk->type), addr); + "%s page at %s has a record number, which is illegal for " + "this page type", + __wt_page_type_string(dsk->type), tag); } /* Check the page flags. 
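	 *
	 * The check below is a copy-and-clear idiom: take a copy, clear each
	 * bit that's legal for this page type (now including the lookaside
	 * WT_PAGE_LAS_UPDATE flag), and treat anything left set as
	 * corruption:
	 *
	 *	flags = dsk->flags;
	 *	if (LF_ISSET(WT_PAGE_COMPRESSED))
	 *		LF_CLR(WT_PAGE_COMPRESSED);
	 *	...
	 *	if (flags != 0)
	 *		WT_RET_VRFY(session,
	 *		    "page at %s has invalid flags set: 0x%" PRIx8,
	 *		    tag, flags);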
*/ flags = dsk->flags; if (LF_ISSET(WT_PAGE_COMPRESSED)) LF_CLR(WT_PAGE_COMPRESSED); - if (LF_ISSET(WT_PAGE_ENCRYPTED)) - LF_CLR(WT_PAGE_ENCRYPTED); if (dsk->type == WT_PAGE_ROW_LEAF) { if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) && LF_ISSET(WT_PAGE_EMPTY_V_NONE)) WT_RET_VRFY(session, "page at %s has invalid flags combination: 0x%" PRIx8, - addr, dsk->flags); + tag, dsk->flags); if (LF_ISSET(WT_PAGE_EMPTY_V_ALL)) LF_CLR(WT_PAGE_EMPTY_V_ALL); if (LF_ISSET(WT_PAGE_EMPTY_V_NONE)) LF_CLR(WT_PAGE_EMPTY_V_NONE); } + if (LF_ISSET(WT_PAGE_ENCRYPTED)) + LF_CLR(WT_PAGE_ENCRYPTED); + if (LF_ISSET(WT_PAGE_LAS_UPDATE)) + LF_CLR(WT_PAGE_LAS_UPDATE); if (flags != 0) WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, - addr, flags); + tag, flags); /* Unused bytes */ for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i) if (*p != '\0') WT_RET_VRFY(session, "page at %s has non-zero unused page header bytes", - addr); + tag); /* * Any bytes after the data chunk should be nul bytes; ignore if the @@ -129,7 +132,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, if (*p != '\0') WT_RET_VRFY(session, "%s page at %s has non-zero trailing bytes", - __wt_page_type_string(dsk->type), addr); + __wt_page_type_string(dsk->type), tag); } /* Check for empty pages, then verify the items on the page. */ @@ -141,28 +144,28 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, case WT_PAGE_ROW_LEAF: if (!empty_page_ok && dsk->u.entries == 0) WT_RET_VRFY(session, "%s page at %s has no entries", - __wt_page_type_string(dsk->type), addr); + __wt_page_type_string(dsk->type), tag); break; case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: if (dsk->u.datalen == 0) WT_RET_VRFY(session, "%s page at %s has no data", - __wt_page_type_string(dsk->type), addr); + __wt_page_type_string(dsk->type), tag); break; } switch (dsk->type) { case WT_PAGE_COL_INT: - return (__verify_dsk_col_int(session, addr, dsk)); + return (__verify_dsk_col_int(session, tag, dsk)); case WT_PAGE_COL_FIX: - return (__verify_dsk_col_fix(session, addr, dsk)); + return (__verify_dsk_col_fix(session, tag, dsk)); case WT_PAGE_COL_VAR: - return (__verify_dsk_col_var(session, addr, dsk)); + return (__verify_dsk_col_var(session, tag, dsk)); case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - return (__verify_dsk_row(session, addr, dsk)); + return (__verify_dsk_row(session, tag, dsk)); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: - return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen)); + return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen)); WT_ILLEGAL_VALUE(session); } /* NOTREACHED */ @@ -173,9 +176,9 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session, * Verify a single Btree page as read from disk. */ int -__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf) +__wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf) { - return (__wt_verify_dsk_image(session, addr, buf->data, buf->size, 0)); + return (__wt_verify_dsk_image(session, tag, buf->data, buf->size, 0)); } /* @@ -184,7 +187,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf) */ static int __verify_dsk_row( - WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; @@ -220,16 +223,16 @@ __verify_dsk_row( ++cell_num; /* Carefully unpack the cell. 
*/ - if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { - ret = __err_cell_corrupted(session, cell_num, addr); + if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0) { + ret = __err_cell_corrupted(session, cell_num, tag); goto err; } /* Check the raw and collapsed cell types. */ WT_ERR(__err_cell_type( - session, cell_num, addr, unpack->raw, dsk->type)); + session, cell_num, tag, unpack->raw, dsk->type)); WT_ERR(__err_cell_type( - session, cell_num, addr, unpack->type, dsk->type)); + session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; /* @@ -256,7 +259,7 @@ __verify_dsk_row( WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent keys", - cell_num - 1, addr); + cell_num - 1, tag); } last_cell_type = WAS_KEY; break; @@ -269,14 +272,14 @@ __verify_dsk_row( switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, - "page at %s begins with a value", addr); + "page at %s begins with a value", tag); case WAS_KEY: break; case WAS_VALUE: WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent values", - cell_num - 1, addr); + cell_num - 1, tag); } last_cell_type = WAS_VALUE; break; @@ -327,7 +330,7 @@ __verify_dsk_row( "the %" PRIu32 " key on page at %s is the first " "non-overflow key on the page and has a non-zero " "prefix compression value", - cell_num, addr); + cell_num, tag); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) @@ -335,7 +338,7 @@ __verify_dsk_row( "key %" PRIu32 " on page at %s has a prefix " "compression count of %" PRIu32 ", larger than " "the length of the previous key, %" WT_SIZET_FMT, - cell_num, addr, prefix, last->size); + cell_num, tag, prefix, last->size); /* * If Huffman decoding required, unpack the cell to build the @@ -394,7 +397,7 @@ key_compare: /* WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " "page at %s are incorrectly sorted", - cell_num - 2, cell_num, addr); + cell_num - 2, cell_num, tag); } /* @@ -414,7 +417,7 @@ key_compare: /* } WT_ASSERT(session, last != current); } - WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell)); + WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell)); /* * On row-store internal pages, and on row-store leaf pages, where the @@ -428,7 +431,7 @@ key_compare: /* "%s page at %s has a key count of %" PRIu32 " and a " "physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), - addr, key_cnt, dsk->u.entries); + tag, key_cnt, dsk->u.entries); if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) && key_cnt != dsk->u.entries) @@ -437,7 +440,7 @@ key_compare: /* "key count of %" PRIu32 " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), - addr, key_cnt, dsk->u.entries); + tag, key_cnt, dsk->u.entries); if (dsk->type == WT_PAGE_ROW_LEAF && F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) && key_cnt * 2 != dsk->u.entries) @@ -446,10 +449,10 @@ key_compare: /* "key count of %" PRIu32 " and a physical entry count of %" PRIu32, __wt_page_type_string(dsk->type), - addr, key_cnt, dsk->u.entries); + tag, key_cnt, dsk->u.entries); if (0) { -eof: ret = __err_eof(session, cell_num, addr); +eof: ret = __err_eof(session, cell_num, tag); } if (0) { @@ -468,7 +471,7 @@ err: if (ret == 0) */ static int __verify_dsk_col_int( - WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; @@ -487,20 +490,20 @@ __verify_dsk_col_int( ++cell_num; /* 
Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(cell, unpack, end) != 0) - return (__err_cell_corrupted(session, cell_num, addr)); + if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0) + return (__err_cell_corrupted(session, cell_num, tag)); /* Check the raw and collapsed cell types. */ WT_RET(__err_cell_type( - session, cell_num, addr, unpack->raw, dsk->type)); + session, cell_num, tag, unpack->raw, dsk->type)); WT_RET(__err_cell_type( - session, cell_num, addr, unpack->type, dsk->type)); + session, cell_num, tag, unpack->type, dsk->type)); /* Check if any referenced item is entirely in the file. */ if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) - return (__err_eof(session, cell_num, addr)); + return (__err_eof(session, cell_num, tag)); } - WT_RET(__verify_dsk_memsize(session, addr, dsk, cell)); + WT_RET(__verify_dsk_memsize(session, tag, dsk, cell)); return (0); } @@ -511,7 +514,7 @@ __verify_dsk_col_int( */ static int __verify_dsk_col_fix( - WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BTREE *btree; uint32_t datalen; @@ -519,7 +522,7 @@ __verify_dsk_col_fix( btree = S2BT(session); datalen = __bitstr_size(btree->bitcnt * dsk->u.entries); - return (__verify_dsk_chunk(session, addr, dsk, datalen)); + return (__verify_dsk_chunk(session, tag, dsk, datalen)); } /* @@ -528,7 +531,7 @@ __verify_dsk_col_fix( */ static int __verify_dsk_col_var( - WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) + WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; @@ -554,20 +557,20 @@ __verify_dsk_col_var( ++cell_num; /* Carefully unpack the cell. */ - if (__wt_cell_unpack_safe(cell, unpack, end) != 0) - return (__err_cell_corrupted(session, cell_num, addr)); + if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0) + return (__err_cell_corrupted(session, cell_num, tag)); /* Check the raw and collapsed cell types. */ WT_RET(__err_cell_type( - session, cell_num, addr, unpack->raw, dsk->type)); + session, cell_num, tag, unpack->raw, dsk->type)); WT_RET(__err_cell_type( - session, cell_num, addr, unpack->type, dsk->type)); + session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; /* Check if any referenced item is entirely in the file. 
*/ if (cell_type == WT_CELL_VALUE_OVFL && !bm->addr_valid(bm, session, unpack->data, unpack->size)) - return (__err_eof(session, cell_num, addr)); + return (__err_eof(session, cell_num, tag)); /* * Compare the last two items and see if reconciliation missed @@ -586,7 +589,7 @@ match_err: WT_RET_VRFY(session, "data entries %" PRIu32 " and %" PRIu32 " on page at %s are identical and should " "have been run-length encoded", - cell_num - 1, cell_num, addr); + cell_num - 1, cell_num, tag); switch (cell_type) { case WT_CELL_DEL: @@ -604,7 +607,7 @@ match_err: WT_RET_VRFY(session, break; } } - WT_RET(__verify_dsk_memsize(session, addr, dsk, cell)); + WT_RET(__verify_dsk_memsize(session, tag, dsk, cell)); return (0); } @@ -615,7 +618,7 @@ match_err: WT_RET_VRFY(session, */ static int __verify_dsk_memsize(WT_SESSION_IMPL *session, - const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell) + const char *tag, const WT_PAGE_HEADER *dsk, WT_CELL *cell) { size_t len; @@ -630,7 +633,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session, WT_RET_VRFY(session, "%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data " "after the last cell", - __wt_page_type_string(dsk->type), addr, len); + __wt_page_type_string(dsk->type), tag, len); } /* @@ -639,7 +642,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session, */ static int __verify_dsk_chunk(WT_SESSION_IMPL *session, - const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen) + const char *tag, const WT_PAGE_HEADER *dsk, uint32_t datalen) { WT_BTREE *btree; uint8_t *p, *end; @@ -655,14 +658,14 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session, if (p + datalen > end) WT_RET_VRFY(session, "data on page at %s extends past the end of the page", - addr); + tag); /* Any bytes after the data chunk should be nul bytes. */ for (p += datalen; p < end; ++p) if (*p != '\0') WT_RET_VRFY(session, "%s page at %s has non-zero trailing bytes", - __wt_page_type_string(dsk->type), addr); + __wt_page_type_string(dsk->type), tag); return (0); } @@ -673,11 +676,11 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session, */ static int __err_cell_corrupted( - WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr) + WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag) { WT_RET_VRFY(session, "item %" PRIu32 " on page at %s is a corrupted cell", - entry_num, addr); + entry_num, tag); } /* @@ -686,7 +689,7 @@ __err_cell_corrupted( */ static int __err_cell_type(WT_SESSION_IMPL *session, - uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type) + uint32_t entry_num, const char *tag, uint8_t cell_type, uint8_t dsk_type) { switch (cell_type) { case WT_CELL_ADDR_DEL: @@ -735,7 +738,7 @@ __err_cell_type(WT_SESSION_IMPL *session, WT_RET_VRFY(session, "illegal cell and page type combination: cell %" PRIu32 " on page at %s is a %s cell on a %s page", - entry_num, addr, + entry_num, tag, __wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type)); } @@ -744,10 +747,10 @@ __err_cell_type(WT_SESSION_IMPL *session, * Generic item references non-existent file pages error. 
*/ static int -__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr) +__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag) { WT_RET_VRFY(session, "off-page item %" PRIu32 " on page at %s references non-existent file pages", - entry_num, addr); + entry_num, tag); } diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c index 2fe09681090..cbc5143698b 100644 --- a/src/btree/col_modify.c +++ b/src/btree/col_modify.c @@ -17,7 +17,7 @@ static int __col_insert_alloc( */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, - uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove) + uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove) { WT_BTREE *btree; WT_DECL_RET; @@ -25,7 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; - WT_UPDATE *old_upd; + WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; int append, logged; @@ -33,6 +33,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, btree = cbt->btree; ins = NULL; page = cbt->ref->page; + upd = upd_arg; append = logged = 0; /* This code expects a remove to have a NULL value. */ @@ -48,10 +49,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the - * append list, not the update list. In addition, a recno of 0 + * append list, not the update list. Also, an out-of-band recno * implies an append operation, we're allocating a new row. */ - if (recno == 0 || + if (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(page) : __col_fix_last_recno(page))) append = 1; @@ -76,7 +77,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ - WT_ASSERT(session, upd == NULL); + WT_ASSERT(session, upd_arg == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check( @@ -134,7 +135,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; - if (upd == NULL) { + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, upd)); @@ -160,7 +161,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0) + if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; @@ -192,7 +193,8 @@ err: /* if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); - __wt_free(session, upd); + if (upd_arg == NULL) + __wt_free(session, upd); } return (ret); diff --git a/src/btree/row_key.c b/src/btree/row_key.c index f2868afe13a..4affa7fa62a 100644 --- a/src/btree/row_key.c +++ b/src/btree/row_key.c @@ -448,7 +448,8 @@ next: switch (direction) { * update the page's memory footprint, on failure, free * the allocated memory. 
*/ - if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey)) + if (__wt_atomic_cas_ptr( + (void *)&WT_ROW_KEY_COPY(rip), copy, ikey)) __wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + ikey->size); else @@ -525,7 +526,7 @@ __wt_row_ikey(WT_SESSION_IMPL *session, WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0); WT_ASSERT(session, ref->state != WT_REF_SPLIT); WT_ASSERT(session, - WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey)); + __wt_atomic_cas_ptr(&ref->key.ikey, (WT_IKEY *)oldv, ikey)); } #else ref->key.ikey = ikey; diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c index 62177b7e4c7..888c54d1ec9 100644 --- a/src/btree/row_modify.c +++ b/src/btree/row_modify.c @@ -26,7 +26,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) * Select a spinlock for the page; let the barrier immediately below * keep things from racing too badly. */ - modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn); + modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS; /* * Multiple threads of control may be searching and deciding to modify @@ -34,7 +34,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page) * footprint, else discard the modify structure, another thread did the * work. */ - if (WT_ATOMIC_CAS8(page->modify, NULL, modify)) + if (__wt_atomic_cas_ptr(&page->modify, NULL, modify)) __wt_cache_page_inmem_incr(session, page, sizeof(*modify)); else __wt_free(session, modify); @@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * there should only be one update list per key. */ WT_ASSERT(session, *upd_entry == NULL); + /* * Set the "old" entry to the second update in the list * so that the serialization function succeeds in @@ -192,7 +193,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ - if (WT_SKIP_FIRST(ins_head) == NULL) + if (cbt->ins_stack[0] == NULL) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; @@ -316,7 +317,7 @@ __wt_update_obsolete_check( */ if (first != NULL && (next = first->next) != NULL && - WT_ATOMIC_CAS8(first->next, next, NULL)) + __wt_atomic_cas_ptr(&first->next, next, NULL)) return (next); /* diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c index 9803b924355..d83d3253c44 100644 --- a/src/btree/row_srch.c +++ b/src/btree/row_srch.c @@ -471,6 +471,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *current, *descent; + uint32_t cnt; btree = S2BT(session); @@ -528,18 +529,22 @@ restart: /* * If the tree is new (and not empty), it might have a large insert - * list, pick the key in the middle of that insert list. + * list. Count how many records are in the list. */ F_SET(cbt, WT_CBT_SEARCH_SMALLEST); if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) WT_ERR(WT_NOTFOUND); - for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) { + for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt) if ((p = WT_SKIP_NEXT(p)) == NULL) break; - if ((p = WT_SKIP_NEXT(p)) == NULL) + + /* + * Select a random number from 0 to (N - 1), return that record. 
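+	 *
+	 * This makes the positioning a two-pass walk, as the code below
+	 * shows: one pass counts the N list entries, a second steps to the
+	 * chosen one, so the cost is O(N) in the insert-list length:
+	 *
+	 *	cnt = __wt_random(&session->rnd) % cnt;
+	 *	for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
+	 *		if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
+	 *			break;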
+ */ + cnt = __wt_random(&session->rnd) % cnt; + for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p) + if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL) break; - t = WT_SKIP_NEXT(t); - } cbt->ref = current; cbt->compare = 0; cbt->ins = t; diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c new file mode 100644 index 00000000000..e269e8702e1 --- /dev/null +++ b/src/cache/cache_las.c @@ -0,0 +1,391 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_las_stats_update -- + * Update the lookaside table statistics for return to the application. + */ +void +__wt_las_stats_update(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS **cstats; + WT_DSRC_STATS **dstats; + + conn = S2C(session); + + /* + * Lookaside table statistics are copied from the underlying lookaside + * table data-source statistics. If there's no lookaside table, values + * remain 0. In the current system, there's always a lookaside table, + * but there's no reason not to be cautious. + */ + if (conn->las_cursor == NULL) + return; + + /* + * We have a cursor, and we need the underlying data handle; we can get + * to it by way of the underlying btree handle, but it's a little ugly. + */ + cstats = conn->stats; + dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats; + + WT_STAT_SET(session, cstats, + cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); + WT_STAT_SET(session, cstats, + cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); +} + +/* + * __las_cursor_create -- + * Open a new lookaside table cursor. + */ +static int +__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +{ + WT_BTREE *btree; + const char *open_cursor_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + + WT_RET(__wt_open_cursor( + session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); + + /* + * Set special flags for the lookaside table: the lookaside flag (used, + * for example, to avoid writing records during reconciliation), also + * turn off checkpoints and logging. + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) + F_SET(btree, WT_BTREE_LOOKASIDE); + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(btree, WT_BTREE_NO_CHECKPOINT); + if (!F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_SET(btree, WT_BTREE_NO_LOGGING); + + return (0); +} + +/* + * __wt_las_create -- + * Initialize the database's lookaside store. + */ +int +__wt_las_create(WT_SESSION_IMPL *session) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + const char *drop_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; + + conn = S2C(session); + + /* + * Done at startup: we cannot do it on demand because we require the + * schema lock to create and drop the file, and it may not always be + * available. + * + * Open an internal session, used for the shared lookaside cursor. + * + * Sessions associated with a lookaside cursor should never be tapped + * for eviction. + */ + WT_RET(__wt_open_internal_session( + conn, "lookaside table", 1, 1, &conn->las_session)); + session = conn->las_session; + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); + + /* Discard any previous incarnation of the file. 
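+	 *
+	 * For later reference, the cursor this function ends up opening is
+	 * always consumed through a bracket pattern (see the sweep and
+	 * page-instantiate callers), so the shared-cursor lock and the
+	 * session's no-cache/no-eviction flags are reliably restored:
+	 *
+	 *	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+	 *	...read, insert or remove lookaside records...
+	 *	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));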
*/
+	WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));
+
+	/* Re-create the file. */
+	WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
+
+	/* Open the shared cursor. */
+	WT_WITHOUT_DHANDLE(session,
+	    ret = __las_cursor_create(session, &conn->las_cursor));
+
+	return (ret);
+}
+
+/*
+ * __wt_las_destroy --
+ *	Destroy the database's lookaside store.
+ */
+int
+__wt_las_destroy(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+	WT_SESSION *wt_session;
+
+	conn = S2C(session);
+
+	if (conn->las_session == NULL)
+		return (0);
+
+	wt_session = &conn->las_session->iface;
+	ret = wt_session->close(wt_session, NULL);
+
+	conn->las_cursor = NULL;
+	conn->las_session = NULL;
+
+	return (ret);
+}
+
+/*
+ * __wt_las_set_written --
+ *	Flag that the lookaside table has been written.
+ */
+void
+__wt_las_set_written(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+
+	conn = S2C(session);
+	if (!conn->las_written) {
+		conn->las_written = true;
+
+		/*
+		 * Push the flag: unnecessary, but from now on page reads must
+		 * deal with lookaside table records, and we only do the write
+		 * once.
+		 */
+		WT_FULL_BARRIER();
+	}
+}
+
+/*
+ * __wt_las_is_written --
+ *	Return if the lookaside table has been written.
+ */
+bool
+__wt_las_is_written(WT_SESSION_IMPL *session)
+{
+	return (S2C(session)->las_written);
+}
+
+/*
+ * __wt_las_cursor --
+ *	Return a lookaside cursor.
+ */
+int
+__wt_las_cursor(
+    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	*cursorp = NULL;
+
+	/*
+	 * We don't want to get tapped for eviction after we start using the
+	 * lookaside cursor; save a copy of the current eviction state, we'll
+	 * turn eviction off before we return.
+	 *
+	 * Don't cache lookaside table pages, we're here because of eviction
+	 * problems and there's no reason to believe lookaside pages will be
+	 * useful more than once.
+	 */
+	*session_flags =
+	    F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+	conn = S2C(session);
+
+	/* Eviction and sweep threads have their own lookaside table cursors. */
+	if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+		if (session->las_cursor == NULL) {
+			WT_WITHOUT_DHANDLE(session, ret =
+			    __las_cursor_create(session, &session->las_cursor));
+			WT_RET(ret);
+		}
+
+		*cursorp = session->las_cursor;
+	} else {
+		/* Lock the shared lookaside cursor. */
+		__wt_spin_lock(session, &conn->las_lock);
+
+		*cursorp = conn->las_cursor;
+	}
+
+	/* Turn caching and eviction off. */
+	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+	return (0);
+}
+
+/*
+ * __wt_las_cursor_close --
+ *	Discard a lookaside cursor.
+ */
+int
+__wt_las_cursor_close(
+    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CURSOR *cursor;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	if ((cursor = *cursorp) == NULL)
+		return (0);
+	*cursorp = NULL;
+
+	/* Reset the cursor. */
+	ret = cursor->reset(cursor);
+
+	/*
+	 * We turned off caching and eviction while the lookaside cursor was in
+	 * use, restore the session's flags.
+	 */
+	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+	F_SET(session, session_flags);
+
+	/*
+	 * Eviction and sweep threads have their own lookaside table cursors;
+	 * else, unlock the shared lookaside cursor.
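+	 *
+	 * Summarizing the sharing model: sessions flagged
+	 * WT_SESSION_LOOKASIDE_CURSOR (eviction, sweep) own a private
+	 * cursor, everyone else serializes on the shared one:
+	 *
+	 *	WT_SESSION_LOOKASIDE_CURSOR:  session->las_cursor, no lock
+	 *	otherwise:  conn->las_cursor, conn->las_lock held from
+	 *	    __wt_las_cursor until this close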
+	 */
+	if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
+		__wt_spin_unlock(session, &conn->las_lock);
+
+	return (ret);
+}
+
+/*
+ * __wt_las_sweep --
+ *	Sweep the lookaside table.
+ */
+int
+__wt_las_sweep(WT_SESSION_IMPL *session)
+{
+	WT_CONNECTION_IMPL *conn;
+	WT_CURSOR *cursor;
+	WT_DECL_ITEM(las_addr);
+	WT_DECL_ITEM(las_key);
+	WT_DECL_RET;
+	WT_ITEM *key;
+	uint64_t cnt, las_counter, las_txnid;
+	uint32_t las_id, session_flags;
+	int notused;
+
+	conn = S2C(session);
+	cursor = NULL;
+	key = &conn->las_sweep_key;
+	session_flags = 0;		/* [-Werror=maybe-uninitialized] */
+
+	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+	/*
+	 * If we're not starting a new sweep, position the cursor using the key
+	 * from the last call (we don't care if we're before or after the key,
+	 * just roughly in the same spot is fine).
+	 */
+	if (conn->las_sweep_call != 0 && key->data != NULL) {
+		__wt_cursor_set_raw_key(cursor, key);
+		if ((ret = cursor->search_near(cursor, &notused)) != 0)
+			goto srch_notfound;
+	}
+
+	/*
+	 * The sweep server wakes up every 10 seconds (by default), it's a
+	 * slow-moving thread. Try to review the entire lookaside table once
+	 * every 5 minutes, or every 30 calls.
+	 *
+	 * The reason is that the lookaside table exists because we're seeing
+	 * cache/eviction pressure (it allows us to trade performance and disk
+	 * space for cache space), and it's likely lookaside blocks are being
+	 * evicted, and reading them back in doesn't help things. A trickier,
+	 * but possibly better, alternative might be to review all lookaside
+	 * blocks in the cache in order to get rid of them, and slowly review
+	 * lookaside blocks that have already been evicted.
+	 *
+	 * We can't know for sure how many records are in the lookaside table,
+	 * the cursor insert and remove statistics aren't updated atomically.
+	 * Start with reviewing 100 rows, and if it takes more than the target
+	 * number of calls to finish, increase the number of rows checked on
+	 * each call; if it takes less than the target calls to finish, then
+	 * decrease the number of rows reviewed on each call (but never less
+	 * than 100).
+	 */
+#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
+#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	30
+	++conn->las_sweep_call;
+	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
+		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;
+
+	/* Walk the file. */
+	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
+		/*
+		 * If the loop terminates after completing a work unit, we will
+		 * continue the table sweep next time. Get a local copy of the
+		 * sweep key, we're going to reset the cursor; do so before
+		 * calling cursor.remove, cursor.remove can discard our hazard
+		 * pointer and the page could be evicted from underneath us.
+		 */
+		if (cnt == 1) {
+			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
+			if (!WT_DATA_IN_ITEM(key))
+				WT_ERR(__wt_buf_set(
+				    session, key, key->data, key->size));
+		}
+
+		WT_ERR(cursor->get_key(cursor,
+		    &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+		/*
+		 * If the on-page record transaction ID associated with the
+		 * record is globally visible, the record can be discarded.
+		 *
+		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
+		 * another thread remove the record before we do, and the cursor
+		 * remains positioned in that case.
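+		 *
+		 * For example (hypothetical numbers): if every running
+		 * transaction's snapshot is past ID 1000, a lookaside record
+		 * whose on-page transaction ID is 900 is globally visible, can
+		 * no longer be needed by any reader and is removed here; a
+		 * record tagged 1100 is kept for a later pass.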
+		 */
+		if (__wt_txn_visible_all(session, las_txnid))
+			WT_ERR(cursor->remove(cursor));
+	}
+
+	/*
+	 * When we reach the end of the lookaside table or the target number
+	 * of calls, adjust the row count. Decrease/increase the row count
+	 * depending on whether the number of calls is less/more than the
+	 * target.
+	 */
+	if (ret == WT_NOTFOUND ||
+	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
+		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
+		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
+			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
+		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
+			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
+	}
+
+srch_notfound:
+	if (ret == WT_NOTFOUND)
+		conn->las_sweep_call = 0;
+
+	WT_ERR_NOTFOUND_OK(ret);
+
+	if (0) {
+err:		__wt_buf_free(session, key);
+	}
+
+	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+	__wt_scr_free(session, &las_addr);
+	__wt_scr_free(session, &las_key);
+
+	return (ret);
+}
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 73837c46ee8..91cfcedfcaf 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -76,6 +76,7 @@ static const WT_CONFIG_CHECK
 confchk_wiredtiger_open_shared_cache_subconfigs[] = {
	{ "chunk", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
	{ "name", "string", NULL, NULL, NULL, 0 },
+	{ "quota", "int", NULL, NULL, NULL, 0 },
	{ "reserve", "int", NULL, NULL, NULL, 0 },
	{ "size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
	{ NULL, NULL, NULL, NULL, NULL, 0 }
@@ -121,7 +122,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
	{ "lsm_merge", "boolean", NULL, NULL, NULL, 0 },
	{ "shared_cache", "category",
	    NULL, NULL,
-	    confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+	    confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
	{ "statistics", "list",
	    NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
	    NULL, 0 },
@@ -520,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
	{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
	{ "shared_cache", "category",
	    NULL, NULL,
-	    confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+	    confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
	{ "statistics", "list",
	    NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
	    NULL, 0 },
@@ -595,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
	{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
	{ "shared_cache", "category",
	    NULL, NULL,
-	    confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+	    confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
	{ "statistics", "list",
	    NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
	    NULL, 0 },
@@ -668,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
	{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
	{ "shared_cache", "category",
	    NULL, NULL,
-	    confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+	    confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
	{ "statistics", "list",
	    NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
	    NULL, 0 },
@@ -740,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
	{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
	{ "shared_cache", "category",
	    NULL, NULL,
-	    confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+	    confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
	{ "statistics", "list",
	    NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
	    NULL, 0 },
@@ -807,8 +808,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," "close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)" - ",lsm_merge=,shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)" - ",statistics=none,statistics_log=(on_close=0," + ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0," + "size=500MB),statistics=none,statistics_log=(on_close=0," "path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", confchk_WT_CONNECTION_reconfigure, 17 @@ -959,9 +960,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),use_environment_priv=0,verbose=", confchk_wiredtiger_open, 34 @@ -979,9 +980,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),use_environment_priv=0,verbose=,version=(major=0," "minor=0)", @@ -999,9 +1000,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),verbose=,version=(major=0,minor=0)", confchk_wiredtiger_open_basecfg, 31 @@ -1018,9 +1019,9 @@ static const WT_CONFIG_ENTRY config_entries[] = { "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0" - ",size=500MB),statistics=none,statistics_log=(on_close=0," - "path=\"WiredTigerStat.%d.%H\",sources=," + "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," + "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" + ",path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" ",method=fsync),verbose=", 
confchk_wiredtiger_open_usercfg, 30
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 067ad00560e..b1155d06826 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -432,7 +432,7 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
	    "requires connection encryption to be set");
	hash = __wt_hash_city64(keyid->str, keyid->len);
	bucket = hash % WT_HASH_ARRAY_SIZE;
-	SLIST_FOREACH(kenc, &nenc->keyedhashlh[bucket], l)
+	TAILQ_FOREACH(kenc, &nenc->keyedhashqh[bucket], q)
		if (WT_STRING_MATCH(kenc->keyid, keyid->str, keyid->len))
			goto out;
@@ -450,8 +450,8 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
	WT_ERR(encryptor->sizing(encryptor, &session->iface,
	    &kenc->size_const));
	kenc->encryptor = encryptor;
-	SLIST_INSERT_HEAD(&nenc->keyedlh, kenc, l);
-	SLIST_INSERT_HEAD(&nenc->keyedhashlh[bucket], kenc, hashl);
+	TAILQ_INSERT_HEAD(&nenc->keyedqh, kenc, q);
+	TAILQ_INSERT_HEAD(&nenc->keyedhashqh[bucket], kenc, hashq);

out:	__wt_spin_unlock(session, &conn->encryptor_lock);
	*kencryptorp = kenc;
@@ -506,9 +506,9 @@ __conn_add_encryptor(WT_CONNECTION *wt_conn,
	WT_ERR(__wt_calloc_one(session, &nenc));
	WT_ERR(__wt_strdup(session, name, &nenc->name));
	nenc->encryptor = encryptor;
-	SLIST_INIT(&nenc->keyedlh);
+	TAILQ_INIT(&nenc->keyedqh);
	for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
-		SLIST_INIT(&nenc->keyedhashlh[i]);
+		TAILQ_INIT(&nenc->keyedhashqh[i]);

	TAILQ_INSERT_TAIL(&conn->encryptqh, nenc, q);
	nenc = NULL;
@@ -537,15 +537,14 @@ __wt_conn_remove_encryptor(WT_SESSION_IMPL *session)

	conn = S2C(session);

	while ((nenc = TAILQ_FIRST(&conn->encryptqh)) != NULL) {
-		while ((kenc = SLIST_FIRST(&nenc->keyedlh)) != NULL) {
+		while ((kenc = TAILQ_FIRST(&nenc->keyedqh)) != NULL) {
			/* Call any termination method. */
			if (kenc->owned && kenc->encryptor->terminate != NULL)
				WT_TRET(kenc->encryptor->terminate(
				    kenc->encryptor, (WT_SESSION *)session));

			/* Remove from the connection's list, free memory. */
-			SLIST_REMOVE(
-			    &nenc->keyedlh, kenc, __wt_keyed_encryptor, l);
+			TAILQ_REMOVE(&nenc->keyedqh, kenc, q);
			__wt_free(session, kenc->keyid);
			__wt_free(session, kenc);
		}
@@ -1725,7 +1724,8 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
	    "encryption=(secretkey=),"
	    "exclusive=,"
	    "log=(recover=),"
-	    "use_environment_priv=,", &base_config));
+	    "use_environment_priv=,"
+	    "verbose=,", &base_config));
	WT_ERR(__wt_config_init(session, &parser, base_config));
	while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
		/* Fix quoting for non-trivial settings. */
@@ -1795,6 +1795,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
	WT_DECL_RET;
	const WT_NAME_FLAG *ft;
	WT_SESSION_IMPL *session;
+	int64_t config_base_set;
	const char *enc_cfg[] = { NULL, NULL };
	char version[64];
@@ -1836,6 +1837,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
	cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
	cfg[1] = config;

+	/* Capture the config_base setting for later use. */
+	WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+	config_base_set = cval.val;
+
	/* Configure error messages so we get them right early.
*/
	WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
	if (cval.len != 0)
@@ -1873,7 +1878,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
	    WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) >=
	    (int)sizeof(version), ENOMEM);
	__conn_config_append(cfg, version);
-	WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
+
+	/* Ignore the base config file if config_base is set to false. */
+	if (config_base_set != 0)
+		WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
	__conn_config_append(cfg, config);
	WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, i2));
	WT_ERR(__conn_config_env(session, cfg, i3));
@@ -1904,7 +1912,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
	conn->hazard_max = (uint32_t)cval.val;

	WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
-	conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+	conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS;

	WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
	conn->session_scratch_max = (size_t)cval.val;
@@ -2023,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
	WT_ERR(__wt_turtle_init(session));
	WT_ERR(__wt_metadata_open(session));

-	/*
-	 * Start the worker threads last.
-	 */
+	/* Start the worker threads and run recovery. */
	WT_ERR(__wt_connection_workers(session, cfg));

+	/* Create the lookaside table. */
+	WT_ERR(__wt_las_create(session));
+
	WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
	*wt_connp = &conn->iface;
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index d62425fe536..8f62c7140c7 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -156,7 +156,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])

	/* Allocate the LRU eviction queue. */
	cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
-	WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+	WT_ERR(__wt_calloc_def(session,
+	    cache->evict_slots, &cache->evict_queue));

	/*
	 * We get/set some values in the cache statistics (rather than have
@@ -178,12 +179,12 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
{
	WT_CACHE *cache;
	WT_CONNECTION_IMPL *conn;
-	WT_CONNECTION_STATS *stats;
+	WT_CONNECTION_STATS **stats;
	uint64_t inuse, leaf, used;

	conn = S2C(session);
	cache = conn->cache;
-	stats = &conn->stats;
+	stats = conn->stats;

	inuse = __wt_cache_bytes_inuse(cache);
	/*
@@ -193,19 +194,23 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
	used = cache->bytes_overflow + cache->bytes_internal;
	leaf = inuse > used ?
inuse - used : 0; - WT_STAT_SET(stats, cache_bytes_max, conn->cache_size); - WT_STAT_SET(stats, cache_bytes_inuse, inuse); + WT_STAT_SET(session, stats, cache_bytes_max, conn->cache_size); + WT_STAT_SET(session, stats, cache_bytes_inuse, inuse); - WT_STAT_SET(stats, cache_overhead, cache->overhead_pct); - WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); - WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache)); - WT_STAT_SET(stats, + WT_STAT_SET(session, stats, cache_overhead, cache->overhead_pct); + WT_STAT_SET( + session, stats, cache_pages_inuse, __wt_cache_pages_inuse(cache)); + WT_STAT_SET( + session, stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache)); + WT_STAT_SET(session, stats, cache_eviction_maximum_page_size, cache->evict_max_page_size); - WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty); + WT_STAT_SET(session, stats, cache_pages_dirty, cache->pages_dirty); - WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal); - WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow); - WT_STAT_SET(stats, cache_bytes_leaf, leaf); + WT_STAT_SET( + session, stats, cache_bytes_internal, cache->bytes_internal); + WT_STAT_SET( + session, stats, cache_bytes_overflow, cache->bytes_overflow); + WT_STAT_SET(session, stats, cache_bytes_leaf, leaf); } /* @@ -246,7 +251,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session) __wt_spin_destroy(session, &cache->evict_lock); __wt_spin_destroy(session, &cache->evict_walk_lock); - __wt_free(session, cache->evict); + __wt_free(session, cache->evict_queue); __wt_free(session, conn->cache); return (ret); } diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index fdc95a32387..aaae58ef168 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -22,21 +22,22 @@ */ #define WT_CACHE_POOL_REDUCE_THRESHOLD 20 /* Balancing passes after a bump before a connection is a candidate. */ -#define WT_CACHE_POOL_BUMP_SKIPS 10 +#define WT_CACHE_POOL_BUMP_SKIPS 5 /* Balancing passes after a reduction before a connection is a candidate. */ -#define WT_CACHE_POOL_REDUCE_SKIPS 5 +#define WT_CACHE_POOL_REDUCE_SKIPS 10 /* * Constants that control how much influence different metrics have on * the pressure calculation. 
*/ -#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10 -#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50 +#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3 +#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6 #define WT_CACHE_POOL_READ_MULTIPLIER 1 -static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *); +static int __cache_pool_adjust( + WT_SESSION_IMPL *, uint64_t, uint64_t, int, int *); static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *); -static int __cache_pool_balance(WT_SESSION_IMPL *); +static int __cache_pool_balance(WT_SESSION_IMPL *, int); /* * __wt_cache_pool_config -- @@ -51,7 +52,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) WT_DECL_RET; char *pool_name; int created, updating; - uint64_t chunk, reserve, size, used_cache; + uint64_t chunk, quota, reserve, size, used_cache; conn = S2C(session); created = updating = 0; @@ -142,6 +143,11 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) chunk = (uint64_t)cval.val; else chunk = cp->chunk; + if (__wt_config_gets(session, &cfg[1], + "shared_cache.quota", &cval) == 0 && cval.val != 0) + quota = (uint64_t)cval.val; + else + quota = cp->quota; } else { /* * The only time shared cache configuration uses default @@ -155,6 +161,9 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) session, cfg, "shared_cache.chunk", &cval)); WT_ASSERT(session, cval.val != 0); chunk = (uint64_t)cval.val; + WT_ERR(__wt_config_gets( + session, cfg, "shared_cache.quota", &cval)); + quota = (uint64_t)cval.val; } /* @@ -197,8 +206,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg) /* The configuration is verified - it's safe to update the pool. */ cp->size = size; cp->chunk = chunk; + cp->quota = quota; conn->cache->cp_reserved = reserve; + conn->cache->cp_quota = quota; /* Wake up the cache pool server so any changes are noticed. */ if (updating) @@ -402,7 +413,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) * effectively used. */ static int -__cache_pool_balance(WT_SESSION_IMPL *session) +__cache_pool_balance(WT_SESSION_IMPL *session, int forward) { WT_CACHE_POOL *cp; WT_DECL_RET; @@ -421,16 +432,16 @@ __cache_pool_balance(WT_SESSION_IMPL *session) WT_ERR(__cache_pool_assess(session, &highest)); bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; + /* * Actively attempt to: * - Reduce the amount allocated, if we are over the budget * - Increase the amount used if there is capacity and any pressure. */ - for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD; - F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && - F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) { + while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { WT_ERR(__cache_pool_adjust( - session, highest, bump_threshold, &adjusted)); + session, highest, bump_threshold, forward, &adjusted)); /* * Stop if the amount of cache being used is stable, and we * aren't over capacity. 
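
The hunk below reworks the per-member pressure metric: activity deltas are combined with the multipliers defined above, weighted so that smaller caches score higher, then smoothed against history. As a self-contained sketch of that calculation (the helper name and parameter list are hypothetical; only the WT_CACHE_POOL_*_MULTIPLIER values are taken from this change):

#include <stdint.h>

#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER	3
#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER	6
#define WT_CACHE_POOL_READ_MULTIPLIER		1

/*
 * pool_pressure --
 *	Sketch of the smoothed pressure calculation: combine the activity
 * deltas, weight smaller caches higher and keep 90% of the previous value
 * so a single noisy pass can't trigger a rebalance. The caller is expected
 * to skip members with cache_size == 0.
 */
static uint64_t
pool_pressure(uint64_t prev, uint64_t app_evicts, uint64_t app_waits,
    uint64_t pages_read, uint64_t balanced_size, uint64_t cache_size)
{
	uint64_t tmp;

	tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
	    (app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) +
	    (pages_read * WT_CACHE_POOL_READ_MULTIPLIER);

	/* Weight smaller caches higher. */
	tmp = (uint64_t)(tmp * ((double)balanced_size / cache_size));

	/* Smooth over history. */
	return ((9 * prev + tmp) / 10);
}

With balanced_size being the pool's average member size, a member at half the average sees its raw pressure doubled before smoothing.
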
@@ -456,30 +467,39 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) WT_CACHE *cache; WT_CONNECTION_IMPL *entry; uint64_t app_evicts, app_waits, reads; - uint64_t entries, highest, tmp; + uint64_t balanced_size, entries, highest, tmp; cp = __wt_process.cache_pool; - entries = 0; + balanced_size = entries = 0; highest = 1; /* Avoid divide by zero */ + TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + if (entry->cache_size == 0 || entry->cache == NULL) + continue; + ++entries; + } + + if (entries > 0) + balanced_size = cp->currently_used / entries; + /* Generate read pressure information. */ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { - if (entry->cache_size == 0 || - entry->cache == NULL) + if (entry->cache_size == 0 || entry->cache == NULL) continue; cache = entry->cache; - ++entries; /* * Figure out a delta since the last time we did an assessment * for each metric we are tracking. Watch out for wrapping * of values. + * + * Count pages read, assuming pages are 4KB. */ - tmp = cache->bytes_read; + tmp = cache->bytes_read >> 12; if (tmp >= cache->cp_saved_read) reads = tmp - cache->cp_saved_read; else - reads = (UINT64_MAX - cache->cp_saved_read) + tmp; + reads = tmp; cache->cp_saved_read = tmp; /* Update the application eviction count information */ @@ -500,12 +520,19 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) (UINT64_MAX - cache->cp_saved_app_waits) + tmp; cache->cp_saved_app_waits = tmp; - /* Calculate the weighted pressure for this member */ - cache->cp_pass_pressure = - (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) + + /* Calculate the weighted pressure for this member. */ + tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) + (app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) + (reads * WT_CACHE_POOL_READ_MULTIPLIER); + /* Weight smaller caches higher. */ + tmp = (uint64_t)(tmp * + ((double)balanced_size / entry->cache_size)); + + /* Smooth over history. */ + cache->cp_pass_pressure = + (9 * cache->cp_pass_pressure + tmp) / 10; + if (cache->cp_pass_pressure > highest) highest = cache->cp_pass_pressure; @@ -524,24 +551,25 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest) /* * __cache_pool_adjust -- - * Adjust the allocation of cache to each connection. If force is set + * Adjust the allocation of cache to each connection. If full is set * ignore cache load information, and reduce the allocation for every * connection allocated more than their reserved size. 
*/ static int __cache_pool_adjust(WT_SESSION_IMPL *session, - uint64_t highest, uint64_t bump_threshold, int *adjustedp) + uint64_t highest, uint64_t bump_threshold, int forward, int *adjustedp) { WT_CACHE_POOL *cp; WT_CACHE *cache; WT_CONNECTION_IMPL *entry; - uint64_t adjusted, highest_percentile, pressure, reserved; - int force, grew; + uint64_t adjustment, highest_percentile, pressure, reserved, smallest; + int busy, pool_full, grow; + u_int pct_full; *adjustedp = 0; cp = __wt_process.cache_pool; - force = (cp->currently_used > cp->size); - grew = 0; + grow = 0; + pool_full = (cp->currently_used >= cp->size); /* Highest as a percentage, avoid 0 */ highest_percentile = (highest / 100) + 1; @@ -549,13 +577,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Cache pool distribution: ")); WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t" "cache_size, pressure, skips: ")); + "\t" "cache (MB), pressure, skips, busy, %% full:")); } - TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) { + for (entry = forward ? TAILQ_FIRST(&cp->cache_pool_qh) : + TAILQ_LAST(&cp->cache_pool_qh, __wt_cache_pool_qh); + entry != NULL; + entry = forward ? TAILQ_NEXT(entry, cpq) : + TAILQ_PREV(entry, __wt_cache_pool_qh, cpq)) { cache = entry->cache; reserved = cache->cp_reserved; - adjusted = 0; + adjustment = 0; /* * The read pressure is calculated as a percentage of how @@ -565,84 +597,109 @@ __cache_pool_adjust(WT_SESSION_IMPL *session, * assigned. */ pressure = cache->cp_pass_pressure / highest_percentile; + busy = __wt_eviction_needed(entry->default_session, &pct_full); + WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, - "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32, - entry->cache_size, pressure, cache->cp_skip_count)); + "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u", + entry->cache_size >> 20, pressure, cache->cp_skip_count, + busy, pct_full)); /* Allow to stabilize after changes. */ if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0) continue; + /* * If the entry is currently allocated less than the reserved - * size, increase it's allocation. This should only happen if: - * - It's the first time we've seen this member - * - The reserved size has been adjusted + * size, increase its allocation. This should only happen if: + * - it's the first time we've seen this member, or + * - the reserved size has been adjusted */ if (entry->cache_size < reserved) { - grew = 1; - adjusted = reserved - entry->cache_size; - + grow = 1; + adjustment = reserved - entry->cache_size; /* * Conditions for reducing the amount of resources for an * entry: - * - If we are forcing and this entry has more than the - * minimum amount of space in use. - * - If the read pressure in this entry is below the - * threshold, other entries need more cache, the entry has - * more than the minimum space and there is no available - * space in the pool. + * - the pool is full, + * - application threads are not busy doing eviction already, + * - this entry has more than the minimum amount of space in + * use, + * - the read pressure in this entry is below the threshold, + * other entries need more cache, the entry has more than + * the minimum space and there is no available space in the + * pool. 
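+		 *
+		 * A worked example with made-up numbers: a member holding
+		 * 100MB, with 57MB in use and an eviction trigger of 95%,
+		 * computes smallest = (100 * 57MB) / 95 = 60MB, so at most
+		 * (100MB - 60MB) / 2 = 20MB is surrendered in one pass,
+		 * further capped by the chunk size and the reserved size.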
*/ - } else if ((force && entry->cache_size > reserved) || - (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && - highest > 1 && entry->cache_size > reserved && - cp->currently_used >= cp->size)) { - grew = 0; + } else if (pool_full && !busy && + entry->cache_size > reserved && + pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) { + grow = 0; /* - * Shrink by a chunk size if that doesn't drop us - * below the reserved size. + * Don't drop the size down too much - or it can + * trigger aggressive eviction in the connection, + * which is likely to lead to lower throughput and + * potentially a negative feedback loop in the + * balance algorithm. */ - if (entry->cache_size > cp->chunk + reserved) - adjusted = cp->chunk; - else - adjusted = entry->cache_size - reserved; + smallest = (100 * __wt_cache_bytes_inuse(cache)) / + cache->eviction_trigger; + if (entry->cache_size > smallest) + adjustment = WT_MIN(cp->chunk, + (entry->cache_size - smallest) / 2); + adjustment = + WT_MIN(adjustment, entry->cache_size - reserved); /* * Conditions for increasing the amount of resources for an * entry: - * - There was some activity across the pool - * - This entry is using less than the entire cache pool - * - The connection is using enough cache to require eviction - * - There is space available in the pool - * - Additional cache would benefit the connection OR - * - The pool is less than half distributed + * - there is space available in the pool + * - the connection isn't over quota + * - the connection is using enough cache to require eviction + * - there was some activity across the pool + * - this entry is using less than the entire cache pool + * - additional cache would benefit the connection OR + * - the pool is less than half distributed */ - } else if (entry->cache_size < cp->size && + } else if (!pool_full && + (cache->cp_quota == 0 || + entry->cache_size < cache->cp_quota) && __wt_cache_bytes_inuse(cache) >= (entry->cache_size * cache->eviction_target) / 100 && - ((cp->currently_used < cp->size && - pressure > bump_threshold) || + (pressure > bump_threshold || cp->currently_used < cp->size * 0.5)) { - grew = 1; - adjusted = WT_MIN(cp->chunk, - cp->size - cp->currently_used); + grow = 1; + adjustment = WT_MIN(WT_MIN(cp->chunk, + cp->size - cp->currently_used), + cache->cp_quota - entry->cache_size); } - if (adjusted > 0) { + /* + * Bounds checking: don't go over the pool size or under the + * reserved size for this cache. + * + * Shrink by a chunk size if that doesn't drop us + * below the reserved size. + * + * Limit the reduction to half of the free space in the + * connection's cache. This should reduce cache sizes + * gradually without stalling application threads. + */ + if (adjustment > 0) { *adjustedp = 1; - if (grew > 0) { + if (grow) { cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS; - entry->cache_size += adjusted; - cp->currently_used += adjusted; + entry->cache_size += adjustment; + cp->currently_used += adjustment; } else { cache->cp_skip_count = WT_CACHE_POOL_REDUCE_SKIPS; WT_ASSERT(session, - entry->cache_size >= adjusted && - cp->currently_used >= adjusted); - entry->cache_size -= adjusted; - cp->currently_used -= adjusted; + entry->cache_size >= adjustment && + cp->currently_used >= adjustment); + entry->cache_size -= adjustment; + cp->currently_used -= adjustment; } WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Allocated %s%" PRId64 " to %s", - grew ? "" : "-", adjusted, entry->home)); + grow ? 
"" : "-", adjustment, entry->home)); + /* * TODO: Add a loop waiting for connection to give up * cache. @@ -663,11 +720,13 @@ __wt_cache_pool_server(void *arg) WT_CACHE_POOL *cp; WT_DECL_RET; WT_SESSION_IMPL *session; + int forward; session = (WT_SESSION_IMPL *)arg; cp = __wt_process.cache_pool; cache = S2C(session)->cache; + forward = 1; while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(cache, WT_CACHE_POOL_RUN)) { @@ -695,8 +754,10 @@ __wt_cache_pool_server(void *arg) * Continue even if there was an error. Details of errors are * reported in the balance function. */ - if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) - (void)__cache_pool_balance(session); + if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) { + (void)__cache_pool_balance(session, forward); + forward = !forward; + } } if (0) { diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 76f55fa44e5..92497484408 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -55,6 +55,8 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session, WT_ERR(__wt_spin_init( session, &dhandle->close_lock, "data handle close")); + __wt_stat_dsrc_init(dhandle); + *dhandlep = dhandle; return (0); @@ -81,7 +83,7 @@ __wt_conn_dhandle_find( bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; if (checkpoint == NULL) { - SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) { + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; if (dhandle->checkpoint == NULL && @@ -91,7 +93,7 @@ __wt_conn_dhandle_find( } } } else - SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) { + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; if (dhandle->checkpoint != NULL && @@ -404,7 +406,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, if (uri != NULL) { bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !F_ISSET(dhandle, WT_DHANDLE_DEAD) && strcmp(uri, dhandle->name) == 0 && @@ -412,7 +414,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, WT_RET(__conn_btree_apply_internal( session, dhandle, func, cfg)); } else { - SLIST_FOREACH(dhandle, &conn->dhlh, l) + TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !F_ISSET(dhandle, WT_DHANDLE_DEAD) && (apply_checkpoints || @@ -489,7 +491,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, hash = __wt_hash_city64(uri, strlen(uri)); bucket = hash % WT_HASH_ARRAY_SIZE; - SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !F_ISSET(dhandle, WT_DHANDLE_DEAD) && (hash == dhandle->name_hash && @@ -538,7 +540,7 @@ __wt_conn_dhandle_close_all( WT_ASSERT(session, session->dhandle == NULL); bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; - SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) { + TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) { if (strcmp(dhandle->name, uri) != 0 || F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; @@ -596,6 +598,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final) bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); + WT_ASSERT(session, dhandle != conn->cache->evict_file_next); /* Check if the handle was reacquired by a session while we waited. 
*/ if (!final && @@ -675,7 +678,7 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) * the list, so we do it the hard way. */ restart: - SLIST_FOREACH(dhandle, &conn->dhlh, l) { + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (WT_IS_METADATA(dhandle)) continue; @@ -694,7 +697,7 @@ restart: F_SET(session, WT_SESSION_NO_DATA_HANDLES); /* Close the metadata file handle. */ - while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL) + while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__wt_conn_dhandle_discard_single(session, 1, 0))); diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c index 94e69897c1d..7a8a6cba838 100644 --- a/src/conn/conn_handle.c +++ b/src/conn/conn_handle.c @@ -21,14 +21,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) session = conn->default_session; for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) { - SLIST_INIT(&conn->dhhash[i]); /* Data handle hash lists */ - SLIST_INIT(&conn->fhhash[i]); /* File handle hash lists */ + TAILQ_INIT(&conn->dhhash[i]); /* Data handle hash lists */ + TAILQ_INIT(&conn->fhhash[i]); /* File handle hash lists */ } - SLIST_INIT(&conn->dhlh); /* Data handle list */ + TAILQ_INIT(&conn->dhqh); /* Data handle list */ TAILQ_INIT(&conn->dlhqh); /* Library list */ TAILQ_INIT(&conn->dsrcqh); /* Data source list */ - SLIST_INIT(&conn->fhlh); /* File list */ + TAILQ_INIT(&conn->fhqh); /* File list */ TAILQ_INIT(&conn->collqh); /* Collator list */ TAILQ_INIT(&conn->compqh); /* Compressor list */ TAILQ_INIT(&conn->encryptqh); /* Encryptor list */ @@ -45,7 +45,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_conn_config_init(session)); /* Statistics. */ - __wt_stat_init_connection_stats(&conn->stats); + __wt_stat_connection_init(conn); /* Locks. */ WT_RET(__wt_spin_init(session, &conn->api_lock, "api")); @@ -55,11 +55,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list")); WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); + WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); - WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock)); - for (i = 0; i < WT_PAGE_LOCKS(conn); ++i) + + WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock)); + WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock); + for (i = 0; i < WT_PAGE_LOCKS; ++i) WT_RET( __wt_spin_init(session, &conn->page_lock[i], "btree page")); @@ -91,8 +94,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) */ WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager")); for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) - SLIST_INIT(&conn->blockhash[i]);/* Block handle hash lists */ - SLIST_INIT(&conn->blocklh); /* Block manager list */ + TAILQ_INIT(&conn->blockhash[i]);/* Block handle hash lists */ + TAILQ_INIT(&conn->blockqh); /* Block manager list */ return (0); } @@ -138,10 +141,11 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->encryptor_lock); __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); + __wt_spin_destroy(session, &conn->las_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); - for (i = 0; i < WT_PAGE_LOCKS(conn); 
++i) + for (i = 0; i < WT_PAGE_LOCKS; ++i) __wt_spin_destroy(session, &conn->page_lock[i]); __wt_free(session, conn->page_lock); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index de4bf7268ed..2b115190b06 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -287,8 +287,9 @@ __log_file_server(void *arg) WT_DECL_RET; WT_FH *close_fh; WT_LOG *log; - WT_LSN close_end_lsn, close_lsn, min_lsn; + WT_LSN close_end_lsn, min_lsn; WT_SESSION_IMPL *session; + uint32_t filenum; int locked; session = arg; @@ -300,66 +301,97 @@ __log_file_server(void *arg) * If there is a log file to close, make sure any outstanding * write operations have completed, then fsync and close it. */ - if ((close_fh = log->log_close_fh) != NULL && - (ret = __wt_log_extract_lognum(session, close_fh->name, - &close_lsn.file)) == 0 && - close_lsn.file < log->write_lsn.file) { + if ((close_fh = log->log_close_fh) != NULL) { + WT_ERR(__wt_log_extract_lognum(session, close_fh->name, + &filenum)); /* - * We've copied the file handle, clear out the one in - * log structure to allow it to be set again. + * We update the close file handle before updating the + * close LSN when changing files. It is possible we + * could see mismatched settings. If we do, yield + * until it is set. This should rarely happen. */ - log->log_close_fh = NULL; - /* - * Set the close_end_lsn to the LSN immediately after - * ours. That is, the beginning of the next log file. - * We need to know the LSN file number of our own close - * in case earlier calls are still in progress and the - * next one to move the sync_lsn into the next file for - * later syncs. - */ - close_lsn.offset = 0; - close_end_lsn = close_lsn; - close_end_lsn.file++; - WT_ERR(__wt_fsync(session, close_fh)); - __wt_spin_lock(session, &log->log_sync_lock); - locked = 1; - WT_ERR(__wt_close(session, &close_fh)); - WT_ASSERT(session, - WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0); - log->sync_lsn = close_end_lsn; - WT_ERR(__wt_cond_signal(session, log->log_sync_cond)); - locked = 0; - __wt_spin_unlock(session, &log->log_sync_lock); + while (log->log_close_lsn.file < filenum) + __wt_yield(); + + if (__wt_log_cmp( + &log->write_lsn, &log->log_close_lsn) >= 0) { + /* + * We've copied the file handle, clear out the + * one in the log structure to allow it to be + * set again. Copy the LSN before clearing + * the file handle. + * Use a barrier to make sure the compiler does + * not reorder the following two statements. + */ + close_end_lsn = log->log_close_lsn; + WT_FULL_BARRIER(); + log->log_close_fh = NULL; + /* + * Set the close_end_lsn to the LSN immediately + * after ours. That is, the beginning of the + * next log file. We need to know the LSN + * file number of our own close in case earlier + * calls are still in progress and the next one + * to move the sync_lsn into the next file for + * later syncs. + */ + close_end_lsn.file++; + close_end_lsn.offset = 0; + WT_ERR(__wt_fsync(session, close_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + WT_ERR(__wt_close(session, &close_fh)); + WT_ASSERT(session, __wt_log_cmp( + &close_end_lsn, &log->sync_lsn) >= 0); + log->sync_lsn = close_end_lsn; + WT_ERR(__wt_cond_signal( + session, log->log_sync_cond)); + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } } /* * If a later thread asked for a background sync, do it now. 
*/ - if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) { + if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) { /* * Save the latest write LSN which is the minimum * we will have written to disk. */ min_lsn = log->write_lsn; /* - * The sync LSN we asked for better be smaller than - * the current written LSN. + * We have to wait until the LSN we asked for is + * written. If it isn't signal the wrlsn thread + * to get it written. */ - WT_ASSERT(session, - WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0); - WT_ERR(__wt_fsync(session, log->log_fh)); - __wt_spin_lock(session, &log->log_sync_lock); - locked = 1; - /* - * The sync LSN could have advanced while we were - * writing to disk. - */ - if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) { - log->sync_lsn = min_lsn; + if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) { + WT_ERR(__wt_fsync(session, log->log_fh)); + __wt_spin_lock(session, &log->log_sync_lock); + locked = 1; + /* + * The sync LSN could have advanced while we + * were writing to disk. + */ + if (__wt_log_cmp( + &log->sync_lsn, &min_lsn) <= 0) { + log->sync_lsn = min_lsn; + WT_ERR(__wt_cond_signal( + session, log->log_sync_cond)); + } + locked = 0; + __wt_spin_unlock(session, &log->log_sync_lock); + } else { WT_ERR(__wt_cond_signal( - session, log->log_sync_cond)); + session, conn->log_wrlsn_cond)); + /* + * We do not want to wait potentially a second + * to process this. Yield to give the wrlsn + * thread a chance to run and try again in + * this case. + */ + __wt_yield(); + continue; } - locked = 0; - __wt_spin_unlock(session, &log->log_sync_lock); } /* Wait until the next event. */ WT_ERR(__wt_cond_wait( @@ -394,26 +426,29 @@ typedef struct { /* * __wt_log_wrlsn -- * Process written log slots and attempt to coalesce them if the LSNs - * are contiguous. Returns 1 if slots were freed, 0 if no slots were - * freed in the progress arg. Must be called with the log slot lock held. + * are contiguous. The purpose of this function is to advance the + * write_lsn in LSN order after the buffer is written to the log file. */ int -__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) +__wt_log_wrlsn(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL]; WT_LOGSLOT *coalescing, *slot; + WT_LSN save_lsn; size_t written_i; uint32_t i, save_i; conn = S2C(session); log = conn->log; + __wt_spin_lock(session, &log->log_writelsn_lock); +restart: coalescing = NULL; + WT_INIT_LSN(&save_lsn); written_i = 0; i = 0; - if (free_i != NULL) - *free_i = WT_SLOT_POOL; /* * Walk the array once saving any slots that are in the @@ -422,9 +457,14 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) while (i < WT_SLOT_POOL) { save_i = i; slot = &log->slot_pool[i++]; - if (free_i != NULL && *free_i == WT_SLOT_POOL && - slot->slot_state == WT_LOG_SLOT_FREE) - *free_i = save_i; + /* + * XXX - During debugging I saw slot 0 become orphaned. + * I believe it is fixed, but check for now. + * This assertion should catch that. + */ + if (slot->slot_state == 0) + WT_ASSERT(session, + slot->slot_release_lsn.file >= log->write_lsn.file); if (slot->slot_state != WT_LOG_SLOT_WRITTEN) continue; written[written_i].slot_index = save_i; @@ -435,15 +475,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * based on the release LSN, and then look for them in order. */ if (written_i > 0) { - /* - * If wanted, reset the yield variable to indicate that we - * have found written slots. 
- */ - if (yield != NULL) - *yield = 0; WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT); - /* * We know the written array is sorted by LSN. Go * through them either advancing write_lsn or coalesce @@ -451,8 +484,28 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) */ for (i = 0; i < written_i; i++) { slot = &log->slot_pool[written[i].slot_index]; + /* + * The log server thread pushes out slots periodically. + * Sometimes they are empty slots. If we find an + * empty slot, where empty means the start and end LSN + * are the same, free it and continue. + */ + if (__wt_log_cmp(&slot->slot_start_lsn, + &slot->slot_release_lsn) == 0 && + __wt_log_cmp(&slot->slot_start_lsn, + &slot->slot_end_lsn) == 0) { + __wt_log_slot_free(session, slot); + continue; + } if (coalescing != NULL) { - if (WT_LOG_CMP(&coalescing->slot_end_lsn, + /* + * If the write_lsn changed, we may be able to + * process slots. Try again. + */ + if (__wt_log_cmp( + &log->write_lsn, &save_lsn) != 0) + goto restart; + if (__wt_log_cmp(&coalescing->slot_end_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; @@ -461,6 +514,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * If we get here we have a slot to coalesce * and free. */ + coalescing->slot_last_offset = + slot->slot_last_offset; coalescing->slot_end_lsn = slot->slot_end_lsn; WT_STAT_FAST_CONN_INCR( session, log_slot_coalesced); @@ -473,8 +528,12 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) /* * If this written slot is not the next LSN, * try to start coalescing with later slots. + * A synchronous write may update write_lsn + * so save the last one we saw to check when + * coalescing slots. */ - if (WT_LOG_CMP( + save_lsn = log->write_lsn; + if (__wt_log_cmp( &log->write_lsn, &written[i].lsn) != 0) { coalescing = slot; continue; @@ -483,27 +542,29 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield) * If we get here we have a slot to process. * Advance the LSN and process the slot. */ - WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn, + WT_ASSERT(session, __wt_log_cmp(&written[i].lsn, &slot->slot_release_lsn) == 0); + if (slot->slot_start_lsn.offset != + slot->slot_last_offset) + slot->slot_start_lsn.offset = + slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; - WT_RET(__wt_cond_signal( + WT_ERR(__wt_cond_signal( session, log->log_write_cond)); WT_STAT_FAST_CONN_INCR(session, log_write_lsn); /* * Signal the close thread if needed. 
*/ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) - WT_RET(__wt_cond_signal( + WT_ERR(__wt_cond_signal( session, conn->log_file_cond)); } - WT_RET(__wt_log_slot_free(session, slot)); - if (free_i != NULL && *free_i == WT_SLOT_POOL && - slot->slot_state == WT_LOG_SLOT_FREE) - *free_i = save_i; + __wt_log_slot_free(session, slot); } } - return (0); +err: __wt_spin_unlock(session, &log->log_writelsn_lock); + return (ret); } /* @@ -515,31 +576,26 @@ __log_wrlsn_server(void *arg) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; - WT_LOG *log; WT_SESSION_IMPL *session; - int locked, yield; session = arg; conn = S2C(session); - log = conn->log; - locked = yield = 0; while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { - __wt_spin_lock(session, &log->log_slot_lock); - locked = 1; - WT_ERR(__wt_log_wrlsn(session, NULL, &yield)); - locked = 0; - __wt_spin_unlock(session, &log->log_slot_lock); - if (++yield < 1000) - __wt_yield(); - else - WT_ERR(__wt_cond_wait(session, - conn->log_wrlsn_cond, 100000)); + /* + * Write out any log record buffers. + */ + WT_ERR(__wt_log_wrlsn(session)); + WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000)); } + /* + * On close we need to do this one more time because there could + * be straggling log writes that need to be written. + */ + WT_ERR(__wt_log_force_write(session, 1)); + WT_ERR(__wt_log_wrlsn(session)); if (0) { err: __wt_err(session, ret, "log wrlsn server error"); } - if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); return (WT_THREAD_RET_VALUE); } @@ -554,44 +610,81 @@ __log_server(void *arg) WT_DECL_RET; WT_LOG *log; WT_SESSION_IMPL *session; - u_int locked; + int freq_per_sec, signalled; session = arg; conn = S2C(session); log = conn->log; - locked = 0; + signalled = 0; + + /* + * Set this to the number of times per second we want to force out the + * log slot buffer. + */ +#define WT_FORCE_PER_SECOND 20 + freq_per_sec = WT_FORCE_PER_SECOND; + + /* + * The log server thread does a variety of work. It forces out any + * buffered log writes. It pre-allocates log files and it performs + * log archiving. The reason the wrlsn thread does not force out + * the buffered writes is because we want to process and move the + * write_lsn forward as quickly as possible. The same reason applies + * to why the log file server thread does not force out the writes. + * That thread does fsync calls which can take a long time and we + * don't want log records sitting in the buffer over the time it + * takes to sync out an earlier file. + */ while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) { /* - * Perform log pre-allocation. + * Slots depend on future activity. Force out buffered + * writes in case we are idle. This cannot be part of the + * wrlsn thread because of interaction advancing the write_lsn + * and a buffer may need to wait for the write_lsn to advance + * in the case of a synchronous buffer. We end up with a hang. */ - if (conn->log_prealloc > 0) - WT_ERR(__log_prealloc_once(session)); + WT_ERR_BUSY_OK(__wt_log_force_write(session, 0)); /* - * Perform the archive. + * We don't want to archive or pre-allocate files as often as + * we want to force out log buffers. Only do it once per second + * or if the condition was signalled. 
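+		 *
+		 * Concretely, with the WT_FORCE_PER_SECOND value of 20
+		 * defined above: the condition wait below times out every
+		 * 1/20th of a second, a buffer force is attempted on each
+		 * iteration, and archiving/pre-allocation run only after 20
+		 * decrements, that is, roughly once per second.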
*/ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { - if (__wt_try_writelock( - session, log->log_archive_lock) == 0) { - locked = 1; - WT_ERR(__log_archive_once(session, 0)); - WT_ERR( __wt_writeunlock( - session, log->log_archive_lock)); - locked = 0; - } else - WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_archive: Blocked due to open log " - "cursor holding archive lock")); + if (--freq_per_sec <= 0 || signalled != 0) { + freq_per_sec = WT_FORCE_PER_SECOND; + + /* + * Perform log pre-allocation. + */ + if (conn->log_prealloc > 0) + WT_ERR(__log_prealloc_once(session)); + + /* + * Perform the archive. + */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) { + if (__wt_try_writelock( + session, log->log_archive_lock) == 0) { + ret = __log_archive_once(session, 0); + WT_TRET(__wt_writeunlock( + session, log->log_archive_lock)); + WT_ERR(ret); + } else + WT_ERR( + __wt_verbose(session, WT_VERB_LOG, + "log_archive: Blocked due to open " + "log cursor holding archive lock")); + } } + /* Wait until the next event. */ - WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION)); + WT_ERR(__wt_cond_wait_signal(session, conn->log_cond, + WT_MILLION / WT_FORCE_PER_SECOND, &signalled)); } if (0) { err: __wt_err(session, ret, "log server error"); } - if (locked) - (void)__wt_writeunlock(session, log->log_archive_lock); return (WT_THREAD_RET_VALUE); } @@ -624,6 +717,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_spin_init(session, &log->log_lock, "log")); WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot")); WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync")); + WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, + "log write LSN")); WT_RET(__wt_rwlock_alloc(session, &log->log_archive_lock, "log archive lock")); if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG)) @@ -755,13 +850,11 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); if (conn->log_file_tid_set) { WT_TRET(__wt_cond_signal(session, conn->log_file_cond)); WT_TRET(__wt_thread_join(session, conn->log_file_tid)); conn->log_file_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); if (conn->log_file_session != NULL) { wt_session = &conn->log_file_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); @@ -772,13 +865,13 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = 0; } - WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); if (conn->log_wrlsn_session != NULL) { wt_session = &conn->log_wrlsn_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->log_wrlsn_session = NULL; } + WT_TRET(__wt_log_slot_destroy(session)); WT_TRET(__wt_log_close(session)); /* Close the server thread's session. 
*/ @@ -788,13 +881,18 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session) conn->log_session = NULL; } - WT_TRET(__wt_log_slot_destroy(session)); + /* Destroy the condition variables now that all threads are stopped */ + WT_TRET(__wt_cond_destroy(session, &conn->log_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond)); + WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock)); __wt_spin_destroy(session, &conn->log->log_lock); __wt_spin_destroy(session, &conn->log->log_slot_lock); __wt_spin_destroy(session, &conn->log->log_sync_lock); + __wt_spin_destroy(session, &conn->log->log_writelsn_lock); __wt_free(session, conn->log_path); __wt_free(session, conn->log); return (ret); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index c4350d90adb..8bc69bb3e80 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -30,6 +30,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) /* WT_SESSION_IMPL array. */ WT_RET(__wt_calloc(session, conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions)); + WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->sessions); /* * Open the default session. We open this before starting service @@ -110,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) F_CLR(conn, WT_CONN_SERVER_RUN); WT_TRET(__wt_async_destroy(session)); WT_TRET(__wt_lsm_manager_destroy(session)); + WT_TRET(__wt_sweep_destroy(session)); F_SET(conn, WT_CONN_CLOSING); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, 1)); - WT_TRET(__wt_sweep_destroy(session)); WT_TRET(__wt_evict_destroy(session)); + /* Shut down the lookaside table, after all eviction is complete. */ + WT_TRET(__wt_las_destroy(session)); + /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); @@ -128,7 +132,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * conditional because we allocate the log path so that printlog can * run without running logging or recovery. */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, 1, WT_TXN_LOG_CKPT_STOP, NULL)); F_CLR(conn, WT_CONN_LOG_SERVER_RUN); @@ -145,14 +150,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) * Complain if files weren't closed, ignoring the lock file, we'll * close it in a minute. */ - SLIST_FOREACH(fh, &conn->fhlh, l) { + TAILQ_FOREACH(fh, &conn->fhqh, q) { if (fh == conn->lock_fh) continue; __wt_errx(session, "Connection has open file handles: %s", fh->name); WT_TRET(__wt_close(session, &fh)); - fh = SLIST_FIRST(&conn->fhlh); + fh = TAILQ_FIRST(&conn->fhqh); } /* Disconnect from shared cache - must be before cache destroy. */ @@ -236,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Run recovery. */ WT_RET(__wt_txn_recover(session)); - /* - * Start the handle sweep thread. - */ + /* Start the handle sweep thread. */ WT_RET(__wt_sweep_create(session)); /* Start the optional async threads. 
*/ diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index 9c438c01cd2..3b188bfd22a 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -42,11 +42,25 @@ __stat_sources_free(WT_SESSION_IMPL *session, char ***sources) void __wt_conn_stat_init(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; + WT_CONNECTION_STATS **stats; + + conn = S2C(session); + stats = conn->stats; + __wt_async_stats_update(session); __wt_cache_stats_update(session); + __wt_las_stats_update(session); __wt_txn_stats_update(session); - WT_CONN_STAT(session, file_open) = S2C(session)->open_file_count; + WT_STAT_SET(session, stats, file_open, conn->open_file_count); + WT_STAT_SET(session, + stats, session_cursor_open, conn->open_cursor_count); + WT_STAT_SET(session, stats, dh_conn_handle_count, conn->dhandle_count); + WT_STAT_SET(session, + stats, rec_split_stashed_objects, conn->split_stashed_objects); + WT_STAT_SET(session, + stats, rec_split_stashed_bytes, conn->split_stashed_bytes); } /* @@ -135,11 +149,11 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; + WT_CURSOR_STAT *cst; WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_STATS *stats; - u_int i; - uint64_t max; + int64_t *stats; + int i; const char *uri; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; @@ -163,15 +177,14 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats) */ switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) { case 0: - max = conn_stats ? - sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) : - sizeof(WT_DSRC_STATS) / sizeof(WT_STATS); - for (i = 0, - stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats) + cst = (WT_CURSOR_STAT *)cursor; + for (stats = cst->stats, i = 0; i < cst->stats_count; ++i) WT_ERR(__wt_fprintf(conn->stat_fp, - "%s %" PRIu64 " %s %s\n", - conn->stat_stamp, - stats->v, name, stats->desc)); + "%s %" PRId64 " %s %s\n", + conn->stat_stamp, stats[i], + name, conn_stats ? + __wt_stat_connection_desc(i) : + __wt_stat_dsrc_desc(i))); WT_ERR(cursor->close(cursor)); break; case EBUSY: diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index ec6f628a02e..8da32416242 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -8,55 +8,58 @@ #include "wt_internal.h" +#define WT_DHANDLE_CAN_DISCARD(dhandle) \ + (!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) && \ + dhandle->session_inuse == 0 && dhandle->session_ref == 0) + /* * __sweep_mark -- * Mark idle handles with a time of death, and note if we see dead * handles. */ static int -__sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp) +__sweep_mark(WT_SESSION_IMPL *session, time_t now) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; - time_t now; conn = S2C(session); - *dead_handlesp = 0; - /* Don't discard handles that have been open recently. */ - WT_RET(__wt_seconds(session, &now)); - - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); - SLIST_FOREACH(dhandle, &conn->dhlh, l) { + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (WT_IS_METADATA(dhandle)) continue; - if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - ++*dead_handlesp; - continue; - } - if (dhandle->session_inuse != 0 || - now <= dhandle->timeofdeath + conn->sweep_idle_time || - conn->sweep_idle_time == 0) - continue; - if (dhandle->timeofdeath == 0) { - dhandle->timeofdeath = now; - WT_STAT_FAST_CONN_INCR(session, dh_conn_tod); + + /* + * There are some internal increments of the in-use count such + * as eviction. 
Don't keep handles alive because of those + * cases, but if we see multiple cursors open, clear the time + * of death. + */ + if (dhandle->session_inuse > 1) + dhandle->timeofdeath = 0; + + /* + * If the handle is open exclusive or currently in use, or the + * time of death is already set, move on. + */ + if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) || + dhandle->session_inuse > 0 || + dhandle->timeofdeath != 0) continue; - } - /* We now have a candidate to close. */ - ++*dead_handlesp; + dhandle->timeofdeath = now; + WT_STAT_FAST_CONN_INCR(session, dh_sweep_tod); } return (0); } /* - * __sweep_expire_handle -- + * __sweep_expire_one -- * Mark a single handle dead. */ static int -__sweep_expire_handle(WT_SESSION_IMPL *session) +__sweep_expire_one(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; @@ -113,42 +116,31 @@ err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); * until we have reached the configured minimum number of handles. */ static int -__sweep_expire(WT_SESSION_IMPL *session) +__sweep_expire(WT_SESSION_IMPL *session, time_t now) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; - time_t now; conn = S2C(session); - /* If sweep_idle_time is 0, then we won't expire any cursors */ - if (conn->sweep_idle_time == 0) - return (0); - - /* Don't discard handles that have been open recently. */ - WT_RET(__wt_seconds(session, &now)); - - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); - SLIST_FOREACH(dhandle, &conn->dhlh, l) { + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { /* - * Ignore open files once the open file count reaches the + * Ignore open files once the btree file count is below the * minimum number of handles. */ - if (conn->open_file_count < conn->sweep_handles_min) + if (conn->open_btree_count < conn->sweep_handles_min) break; - if (WT_IS_METADATA(dhandle)) - continue; - if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || - F_ISSET(dhandle, WT_DHANDLE_DEAD)) - continue; - if (dhandle->session_inuse != 0 || + if (WT_IS_METADATA(dhandle) || + !F_ISSET(dhandle, WT_DHANDLE_OPEN) || + dhandle->session_inuse != 0 || + dhandle->timeofdeath == 0 || now <= dhandle->timeofdeath + conn->sweep_idle_time) continue; WT_WITH_DHANDLE(session, dhandle, - ret = __sweep_expire_handle(session)); + ret = __sweep_expire_one(session)); WT_RET_BUSY_OK(ret); } @@ -156,11 +148,11 @@ __sweep_expire(WT_SESSION_IMPL *session) } /* - * __sweep_flush -- - * Flush pages from dead trees. + * __sweep_discard_trees -- + * Discard pages from dead trees. */ static int -__sweep_flush(WT_SESSION_IMPL *session) +__sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp) { WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; @@ -168,8 +160,12 @@ __sweep_flush(WT_SESSION_IMPL *session) conn = S2C(session); - WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps); - SLIST_FOREACH(dhandle, &conn->dhlh, l) { + *dead_handlesp = 0; + + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { + if (WT_DHANDLE_CAN_DISCARD(dhandle)) + ++*dead_handlesp; + if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) || !F_ISSET(dhandle, WT_DHANDLE_DEAD)) continue; @@ -178,9 +174,12 @@ __sweep_flush(WT_SESSION_IMPL *session) WT_WITH_DHANDLE(session, dhandle, ret = __wt_conn_btree_sync_and_close(session, 0, 0)); - /* We closed the btree handle, bump the statistic. */ - if (ret == 0) - WT_STAT_FAST_CONN_INCR(session, dh_conn_handles); + /* We closed the btree handle. 
*/ + if (ret == 0) { + WT_STAT_FAST_CONN_INCR(session, dh_sweep_close); + ++*dead_handlesp; + } else + WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref); WT_RET_BUSY_OK(ret); } @@ -189,8 +188,41 @@ __sweep_flush(WT_SESSION_IMPL *session) } /* + * __sweep_remove_one -- + * Remove a closed handle from the connection list. + */ +static int +__sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle) +{ + WT_DECL_RET; + + /* Try to get exclusive access. */ + WT_RET(__wt_try_writelock(session, dhandle->rwlock)); + + /* + * If there are no longer any references to the handle in any + * sessions, attempt to discard it. + */ + if (!WT_DHANDLE_CAN_DISCARD(dhandle)) + WT_ERR(EBUSY); + + WT_WITH_DHANDLE(session, dhandle, + ret = __wt_conn_dhandle_discard_single(session, 0, 1)); + + /* + * If the handle was not successfully discarded, unlock it and + * don't retry the discard until it times out again. + */ + if (ret != 0) { +err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); + } + + return (ret); +} + +/* * __sweep_remove_handles -- - * Remove closed dhandles from the connection list. + * Remove closed handles from the connection list. */ static int __sweep_remove_handles(WT_SESSION_IMPL *session) @@ -200,41 +232,23 @@ __sweep_remove_handles(WT_SESSION_IMPL *session) WT_DECL_RET; conn = S2C(session); - dhandle = SLIST_FIRST(&conn->dhlh); - for (; dhandle != NULL; dhandle = dhandle_next) { - dhandle_next = SLIST_NEXT(dhandle, l); + for (dhandle = TAILQ_FIRST(&conn->dhqh); + dhandle != NULL; + dhandle = dhandle_next) { + dhandle_next = TAILQ_NEXT(dhandle, q); if (WT_IS_METADATA(dhandle)) continue; - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) || - dhandle->session_inuse != 0 || - dhandle->session_ref != 0) - continue; - - /* Make sure we get exclusive access. */ - if ((ret = - __wt_try_writelock(session, dhandle->rwlock)) == EBUSY) - continue; - WT_RET(ret); - - /* - * If there are no longer any references to the handle in any - * sessions, attempt to discard it. - */ - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) || - dhandle->session_inuse != 0 || dhandle->session_ref != 0) { - WT_RET(__wt_writeunlock(session, dhandle->rwlock)); + if (!WT_DHANDLE_CAN_DISCARD(dhandle)) continue; - } - - WT_WITH_DHANDLE(session, dhandle, - ret = __wt_conn_dhandle_discard_single(session, 0, 1)); - /* If the handle was not successfully discarded, unlock it. */ - if (ret != 0) - WT_TRET(__wt_writeunlock(session, dhandle->rwlock)); + WT_WITH_HANDLE_LIST_LOCK(session, + ret = __sweep_remove_one(session, dhandle)); + if (ret == 0) + WT_STAT_FAST_CONN_INCR(session, dh_sweep_remove); + else + WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref); WT_RET_BUSY_OK(ret); - WT_STAT_FAST_CONN_INCR(session, dh_conn_ref); } return (ret == EBUSY ? 0 : ret); @@ -250,7 +264,8 @@ __sweep_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; - int dead_handles; + time_t now; + u_int dead_handles; session = arg; conn = S2C(session); @@ -263,35 +278,37 @@ __sweep_server(void *arg) /* Wait until the next event. */ WT_ERR(__wt_cond_wait(session, conn->sweep_cond, (uint64_t)conn->sweep_interval * WT_MILLION)); + WT_ERR(__wt_seconds(session, &now)); + + WT_STAT_FAST_CONN_INCR(session, dh_sweeps); /* - * Mark handles with a time of death, and report whether any - * handles are marked dead. + * Sweep the lookaside table. If the lookaside table hasn't yet + * been written, there's no work to do. 
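+		 * Each lookaside record shadows an on-page update and carries
+		 * that update's transaction ID, so the sweep can remove a
+		 * record once the on-page transaction is globally visible
+		 * (see the WT_LAS_FORMAT description in btmem.h).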
*/ - WT_ERR(__sweep_mark(session, &dead_handles)); + if (__wt_las_is_written(session)) + WT_ERR(__wt_las_sweep(session)); /* - * We only want to flush and expire if there are no dead handles - * and if either the sweep_idle_time is not 0, or if we have - * reached the configured limit of handles. + * Mark handles with a time of death, and report whether any + * handles are marked dead. If sweep_idle_time is 0, handles + * never become idle. */ - if (dead_handles == 0 && - (conn->open_file_count < conn->sweep_handles_min || - conn->sweep_idle_time != 0)) - continue; + if (conn->sweep_idle_time != 0) + WT_ERR(__sweep_mark(session, now)); - /* Close handles if we have reached the configured limit */ - if (conn->open_file_count >= conn->sweep_handles_min) { - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __sweep_expire(session)); - WT_ERR(ret); - } + /* + * Close handles if we have reached the configured limit. + * If sweep_idle_time is 0, handles never become idle. + */ + if (conn->sweep_idle_time != 0 && + conn->open_btree_count >= conn->sweep_handles_min) + WT_ERR(__sweep_expire(session, now)); - WT_ERR(__sweep_flush(session)); + WT_ERR(__sweep_discard_trees(session, &dead_handles)); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __sweep_remove_handles(session)); - WT_ERR(ret); + if (dead_handles > 0) + WT_ERR(__sweep_remove_handles(session)); } if (0) { @@ -349,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session) /* * Handle sweep does enough I/O it may be called upon to perform slow * operations for the block manager. + * + * The sweep thread sweeps the lookaside table for outdated records, + * it gets its own cursor for that purpose. + * + * Don't tap the sweep thread for eviction. */ - F_SET(session, WT_SESSION_CAN_WAIT); + F_SET(session, WT_SESSION_CAN_WAIT | + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); WT_RET(__wt_cond_alloc( session, "handle sweep server", 0, &conn->sweep_cond)); @@ -389,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session) conn->sweep_session = NULL; } + + /* Discard any saved lookaside key. */ + __wt_buf_free(session, &conn->las_sweep_key); + return (ret); } diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c index 60d94697189..3d9e5e405e8 100644 --- a/src/cursor/cur_backup.c +++ b/src/cursor/cur_backup.c @@ -514,17 +514,23 @@ static int __backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CURSOR_BACKUP *cb; + const char *name; WT_UNUSED(cfg); cb = session->bkp_cursor; + name = session->dhandle->name; /* Ignore files in the process of being bulk-loaded. */ if (F_ISSET(S2BT(session), WT_BTREE_BULK)) return (0); + /* Ignore the lookaside table. */ + if (strcmp(name, WT_LAS_URI) == 0) + return (0); + /* Add the file to the list of files to be copied. 
*/ - return (__backup_list_append(session, cb, session->dhandle->name)); + return (__backup_list_append(session, cb, name)); } /* diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c index c58d6899150..8ee57d24413 100644 --- a/src/cursor/cur_ds.c +++ b/src/cursor/cur_ds.c @@ -510,7 +510,7 @@ __wt_curds_open( source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); - source->recno = 0; + source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c index d30a2a04c22..436227847af 100644 --- a/src/cursor/cur_file.c +++ b/src/cursor/cur_file.c @@ -369,15 +369,20 @@ __curfile_close(WT_CURSOR *cursor) __wt_buf_free(session, &cbulk->last); } - WT_TRET(__wt_btcur_close(cbt)); - if (cbt->btree != NULL) { + WT_TRET(__wt_btcur_close(cbt, 0)); + /* The URI is owned by the btree handle. */ + cursor->internal_uri = NULL; + WT_TRET(__wt_cursor_close(cursor)); + + /* + * Note: release the data handle last so that cursor statistics are + * updated correctly. + */ + if (session->dhandle != NULL) { /* Increment the data-source's in-use counter. */ __wt_cursor_dhandle_decr_use(session); WT_TRET(__wt_session_release_btree(session)); } - /* The URI is owned by the btree handle. */ - cursor->internal_uri = NULL; - WT_TRET(__wt_cursor_close(cursor)); err: API_END_RET(session, ret); } diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c index 7dad85e9d38..045663b3614 100644 --- a/src/cursor/cur_index.c +++ b/src/cursor/cur_index.c @@ -130,7 +130,8 @@ __curindex_move(WT_CURSOR_INDEX *cindex) (*cp)->recno = first->recno; } F_SET(*cp, WT_CURSTD_KEY_EXT); - WT_RET((*cp)->search(*cp)); + if (cindex->cg_needvalue[i]) + WT_RET((*cp)->search(*cp)); } F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); @@ -320,6 +321,7 @@ __curindex_close(WT_CURSOR *cursor) *cp = NULL; } + __wt_free(session, cindex->cg_needvalue); __wt_free(session, cindex->cg_cursors); if (cindex->key_plan != idx->key_plan) __wt_free(session, cindex->key_plan); @@ -353,14 +355,19 @@ __curindex_open_colgroups( /* Child cursors are opened with dump disabled. */ const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL }; char *proj; + size_t cgcnt; table = cindex->table; - WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp)); + cgcnt = WT_COLGROUPS(table); + WT_RET(__wt_calloc_def(session, cgcnt, &cindex->cg_needvalue)); + WT_RET(__wt_calloc_def(session, cgcnt, &cp)); cindex->cg_cursors = cp; /* Work out which column groups we need. */ for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) { arg = strtoul(proj, &proj, 10); + if (*proj == WT_PROJ_VALUE) + cindex->cg_needvalue[arg] = 1; if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) || cp[arg] != NULL) continue; diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c index 3376f2a3166..ade9fd18962 100644 --- a/src/cursor/cur_log.c +++ b/src/cursor/cur_log.c @@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp) acl = (WT_CURSOR_LOG *)a; bcl = (WT_CURSOR_LOG *)b; WT_ASSERT(session, cmpp != NULL); - *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn); + *cmpp = __wt_log_cmp(acl->cur_lsn, bcl->cur_lsn); /* * If both are on the same LSN, compare step counter. 
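	 * A single log record can yield several cursor positions (for
	 * example, the operations packed into a transaction's commit
	 * record); the step counter orders those positions within one LSN.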
*/ @@ -392,6 +392,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session, WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); + /* + * The user may be trying to read a log record they just wrote. + * Log records may be buffered, so force out any now. + */ + WT_ERR(__wt_log_force_write(session, 1)); + /* Log cursors block archiving. */ WT_ERR(__wt_readlock(session, log->log_archive_lock)); diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c index 82568401319..2216a1d969d 100644 --- a/src/cursor/cur_stat.c +++ b/src/cursor/cur_stat.c @@ -113,12 +113,12 @@ __curstat_get_value(WT_CURSOR *cursor, ...) if (F_ISSET(cursor, WT_CURSTD_RAW)) { WT_ERR(__wt_struct_size(session, &size, cursor->value_format, - cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc, + cst->stats_desc(WT_STAT_KEY_OFFSET(cst)), cst->pv.data, cst->v)); WT_ERR(__wt_buf_initsize(session, &cursor->value, size)); WT_ERR(__wt_struct_pack(session, cursor->value.mem, size, cursor->value_format, - cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc, + cst->stats_desc(WT_STAT_KEY_OFFSET(cst)), cst->pv.data, cst->v)); item = va_arg(ap, WT_ITEM *); @@ -130,7 +130,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...) * pointer support isn't documented, but it's a cheap test. */ if ((p = va_arg(ap, const char **)) != NULL) - *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc; + *p = cst->stats_desc(WT_STAT_KEY_OFFSET(cst)); if ((p = va_arg(ap, const char **)) != NULL) *p = cst->pv.data; if ((v = va_arg(ap, uint64_t *)) != NULL) @@ -215,7 +215,7 @@ __curstat_next(WT_CURSOR *cursor) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_ERR(WT_NOTFOUND); } - cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)]; WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); @@ -254,7 +254,7 @@ __curstat_prev(WT_CURSOR *cursor) WT_ERR(WT_NOTFOUND); } - cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)]; WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); @@ -308,7 +308,7 @@ __curstat_search(WT_CURSOR *cursor) if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst)) WT_ERR(WT_NOTFOUND); - cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v; + cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)]; WT_ERR(__curstat_print_value(session, cst->v, &cst->pv)); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); @@ -354,13 +354,14 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) * Optionally clear the connection statistics. 
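	 * Connection statistics are kept in multiple structures and summed
	 * when read, so the cursor works from a private aggregate; in
	 * outline (the aggregate call is the real one below, the final
	 * read is illustrative):
	 *
	 *	__wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats);
	 *	v = ((int64_t *)&cst->u.conn_stats)[slot];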
*/ __wt_conn_stat_init(session); - cst->u.conn_stats = conn->stats; + __wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) - __wt_stat_refresh_connection_stats(&conn->stats); + __wt_stat_connection_clear_all(conn->stats); - cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats; + cst->stats = (int64_t *)&cst->u.conn_stats; cst->stats_base = WT_CONNECTION_STATS_BASE; - cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS); + cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(int64_t); + cst->stats_desc = __wt_stat_connection_desc; } /* @@ -383,7 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session, filename = uri; if (!WT_PREFIX_SKIP(filename, "file:")) return (EINVAL); - __wt_stat_init_dsrc_stats(&cst->u.dsrc_stats); + __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); WT_RET(__wt_block_manager_size( session, filename, &cst->u.dsrc_stats)); __wt_curstat_dsrc_final(cst); @@ -398,9 +399,10 @@ __curstat_file_init(WT_SESSION_IMPL *session, * Optionally clear the data source statistics. */ if ((ret = __wt_btree_stat_init(session, cst)) == 0) { - cst->u.dsrc_stats = dhandle->stats; + __wt_stat_dsrc_init_single(&cst->u.dsrc_stats); + __wt_stat_dsrc_aggregate(dhandle->stats, &cst->u.dsrc_stats); if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) - __wt_stat_refresh_dsrc_stats(&dhandle->stats); + __wt_stat_dsrc_clear_all(dhandle->stats); __wt_curstat_dsrc_final(cst); } @@ -417,10 +419,10 @@ __curstat_file_init(WT_SESSION_IMPL *session, void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst) { - - cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats; + cst->stats = (int64_t *)&cst->u.dsrc_stats; cst->stats_base = WT_DSRC_STATS_BASE; - cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS); + cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(int64_t); + cst->stats_desc = __wt_stat_dsrc_desc; } /* @@ -495,7 +497,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session, conn = S2C(session); - WT_ERR(__wt_calloc_one(session, &cst)); + WT_RET(__wt_calloc_one(session, &cst)); cursor = &cst->iface; *cursor = iface; cursor->session = &session->iface; diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 858c6af6853..701bd845ae9 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) item->data, item->size, "q", &cursor->recno)); } else cursor->recno = va_arg(ap, uint64_t); - if (cursor->recno == 0) + if (cursor->recno == WT_RECNO_OOB) WT_ERR_MSG(session, EINVAL, - "Record numbers must be greater than zero"); + "%d is an invalid record number", WT_RECNO_OOB); buf->data = &cursor->recno; sz = sizeof(cursor->recno); } else { @@ -463,16 +463,17 @@ __wt_cursor_close(WT_CURSOR *cursor) WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)cursor->session; - __wt_buf_free(session, &cursor->key); - __wt_buf_free(session, &cursor->value); if (F_ISSET(cursor, WT_CURSTD_OPEN)) { TAILQ_REMOVE(&session->cursors, cursor, q); + (void)__wt_atomic_sub32(&S2C(session)->open_cursor_count, 1); WT_STAT_FAST_DATA_DECR(session, session_cursor_open); - WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open); } + __wt_buf_free(session, &cursor->key); + __wt_buf_free(session, &cursor->value); + __wt_free(session, cursor->internal_uri); __wt_free(session, cursor->uri); __wt_overwrite_and_free(session, cursor); @@ -683,8 +684,8 @@ __wt_cursor_init(WT_CURSOR *cursor, TAILQ_INSERT_HEAD(&session->cursors, cursor, q); F_SET(cursor, WT_CURSTD_OPEN); + 
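+	/*
+	 * The connection-wide open cursor count replaces the old atomic
+	 * statistic; __wt_conn_stat_init copies it into the statistics
+	 * when they are read.
+	 */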
	(void)__wt_atomic_add32(&S2C(session)->open_cursor_count, 1);
 	WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
-	WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);

 	*cursorp = (cdump != NULL) ? cdump : cursor;
 	return (0);
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index 70a28407ea5..5d0b89d6547 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -2,15 +2,11 @@
 The \c next_random configuration to the WT_SESSION::open_cursor method
 configures the cursor to return a pseudo-random record from a row-store
-object.
-
-The ability to return a random record was added to support a particular
-application, and as a result has somewhat unusual semantics. First, the
-returned record may not be random at all in the case of objects with only a few
-rows (especially when the object has never been written to the backing store).
-In such objects, the WT_CURSOR::next method for cursors configured with \c
-next_random may return the same row on each call. Additionally, even in larger
-objects, the WT_CURSOR::next method usually returns the first record from a
-random page in the underlying file, not a random record from a random page.
+object (the configuration is not supported on other types of objects).
+The configuration has somewhat unusual semantics: first, the returned
+record may not be very random in the case of objects with only a few
+rows. Additionally, even in larger objects, the WT_CURSOR::next method
+generally returns the first record from a random page in the underlying
+file, not a random record from a random page.

 */
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index d9ac58103c5..e0640660b0a 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -3,6 +3,15 @@
 @section version_262 Upgrading to Version 2.6.2
 <dl>
+<dt>Change to config_base=false</dt>
+<dd>
+If \c config_base=false is set in the config passed directly to
+::wiredtiger_open, any existing base configuration file will now be
+ignored. If an application was relying on the old behavior, a connection
+will be opened with different settings after upgrading, which could lead
+to errors or unexpected behavior.
+</dd>
+
 <dt>WT_SESSION.verify</dt>
 <dd>
 The WT_SESSION.verify method in this release has a new configuration
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 38cfc07ac5b..66fabe48fb2 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -79,26 +79,19 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 			WT_ERR(__wt_evict(session, ref, 1));
 			break;
 		case WT_SYNC_DISCARD:
-			WT_ASSERT(session,
-			    __wt_page_can_evict(session, page, 0, NULL));
-			__wt_evict_page_clean_update(session, ref, 1);
-			break;
-		case WT_SYNC_DISCARD_FORCE:
 			/*
-			 * Forced discard of the page, whether clean or dirty.
-			 * If we see a dirty page in a forced discard, clean
-			 * the page, both to keep statistics correct, and to
-			 * let the page-discard function assert no dirty page
-			 * is ever discarded.
+			 * Dead handles may reference dirty pages; clean the
+			 * page, both to keep statistics correct, and to let
+			 * the page-discard function assert no dirty page is
+			 * ever discarded.
*/ - if (__wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); - F_SET(session, WT_SESSION_DISCARD_FORCE); + WT_ASSERT(session, + F_ISSET(session->dhandle, WT_DHANDLE_DEAD) || + __wt_page_can_evict(session, page, 0, NULL)); __wt_evict_page_clean_update(session, ref, 1); - F_CLR(session, WT_SESSION_DISCARD_FORCE); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 6aa61b4137b..ce61aa2c798 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -10,14 +10,13 @@ static int __evict_clear_all_walks(WT_SESSION_IMPL *); static int __evict_clear_walks(WT_SESSION_IMPL *); -static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *); static int WT_CDECL __evict_lru_cmp(const void *, const void *); static int __evict_lru_pages(WT_SESSION_IMPL *, int); -static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t); +static int __evict_lru_walk(WT_SESSION_IMPL *); static int __evict_page(WT_SESSION_IMPL *, int); static int __evict_pass(WT_SESSION_IMPL *); -static int __evict_walk(WT_SESSION_IMPL *, uint32_t); -static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t); +static int __evict_walk(WT_SESSION_IMPL *); +static int __evict_walk_file(WT_SESSION_IMPL *, u_int *); static WT_THREAD_RET __evict_worker(void *); static int __evict_server_work(WT_SESSION_IMPL *); @@ -107,7 +106,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref) __wt_spin_lock(session, &cache->evict_lock); elem = cache->evict_max; - for (i = 0, evict = cache->evict; i < elem; i++, evict++) + for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++) if (evict->ref == ref) { __evict_list_clear(session, evict); break; @@ -159,6 +158,7 @@ __evict_server(void *arg) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; + u_int spins; session = arg; conn = S2C(session); @@ -176,7 +176,27 @@ __evict_server(void *arg) * otherwise we can block applications evicting large pages. */ if (!F_ISSET(cache, WT_CACHE_STUCK)) { - WT_ERR(__evict_clear_walks(session)); + for (spins = 0; (ret = __wt_spin_trylock( + session, &conn->dhandle_lock)) == EBUSY && + !F_ISSET(cache, WT_CACHE_CLEAR_WALKS); + spins++) { + if (spins < 1000) + __wt_yield(); + else + __wt_sleep(0, 1000); + } + /* + * If we gave up acquiring the lock, that indicates a + * session is waiting for us to clear walks. Do that + * as part of a normal pass (without the handle list + * lock) to avoid deadlock. + */ + if (ret == EBUSY) + continue; + WT_ERR(ret); + ret = __evict_clear_all_walks(session); + __wt_spin_unlock(session, &conn->dhandle_lock); + WT_ERR(ret); /* Next time we wake up, reverse the sweep direction. */ cache->flags ^= WT_CACHE_WALK_REVERSE; @@ -227,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session) for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { WT_ERR(__wt_open_internal_session(conn, - "eviction-worker", 0, 0, &workers[i].session)); + "eviction-worker", 1, 0, &workers[i].session)); workers[i].id = i; - F_SET(workers[i].session, WT_SESSION_CAN_WAIT); + + /* + * Eviction worker threads get their own lookaside table cursor. + * Eviction worker threads may be called upon to perform slow + * operations for the block manager. 
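+		 * (WT_SESSION_CAN_WAIT is what permits those slow block
+		 * manager operations; a per-worker lookaside cursor
+		 * presumably keeps concurrent reconciliations from sharing
+		 * a single cursor.)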
+ */ + F_SET(workers[i].session, + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT); if (i < conn->evict_workers_min) { ++conn->evict_workers; @@ -259,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session) /* We need a session handle because we're reading/writing pages. */ WT_RET(__wt_open_internal_session( - conn, "eviction-server", 0, 0, &conn->evict_session)); + conn, "eviction-server", 1, 0, &conn->evict_session)); session = conn->evict_session; /* @@ -276,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session) else F_SET(session, WT_SESSION_CAN_WAIT); + /* The eviction server gets its own lookaside table cursor. */ + F_SET(session, WT_SESSION_LOOKASIDE_CURSOR); + /* * Start the primary eviction server thread after the worker threads * have started to avoid it starting additional worker threads before @@ -385,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error"); } /* - * __evict_has_work -- - * Find out if there is eviction work to be done. + * __evict_update_work -- + * Configure eviction work state. */ -static int -__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp) +static bool +__evict_update_work(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; - uint32_t flags; - int evict, dirty; + uint64_t bytes_inuse, bytes_max, dirty_inuse; conn = S2C(session); cache = conn->cache; - *flagsp = flags = 0; + + /* Clear previous state. */ + cache->state = 0; if (!F_ISSET(conn, WT_CONN_EVICTION_RUN)) - return (0); + return (false); - /* Check to see if the eviction server should run. */ - __wt_cache_status(session, &evict, &dirty); - if (evict) - /* The cache is too small. */ - LF_SET(WT_EVICT_PASS_ALL); - else if (dirty) - /* Too many dirty pages, ignore clean pages. */ - LF_SET(WT_EVICT_PASS_DIRTY); - else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { - /* - * Evict pages with oldest generation (which would otherwise - * block application threads) set regardless of whether we have - * reached the eviction trigger. - */ - LF_SET(WT_EVICT_PASS_WOULD_BLOCK); - F_CLR(cache, WT_CACHE_WOULD_BLOCK); + /* + * Page eviction overrides the dirty target and other types of eviction, + * that is, we don't care where we are with respect to the dirty target + * if page eviction is configured. + * + * Avoid division by zero if the cache size has not yet been set in a + * shared cache. + */ + bytes_max = conn->cache_size + 1; + bytes_inuse = __wt_cache_bytes_inuse(cache); + if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_ALL); + goto done; } - if (F_ISSET(cache, WT_CACHE_STUCK)) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + dirty_inuse = __wt_cache_dirty_inuse(cache); + if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) { + FLD_SET(cache->state, WT_EVICT_PASS_DIRTY); + goto done; + } - *flagsp = flags; - return (0); + /* + * Evict pages with oldest generation (which would otherwise block + * application threads), set regardless of whether we have reached + * the eviction trigger. 
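+	 * Unlike the byte-count checks above (illustratively, a 1GB cache
+	 * with eviction_target=80 enters WT_EVICT_PASS_ALL once more than
+	 * roughly 819MB is in use), this pass is driven by page state alone.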
+ */ + if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) { + FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK); + + F_CLR(cache, WT_CACHE_WOULD_BLOCK); + goto done; + } + return (false); + +done: if (F_ISSET(cache, WT_CACHE_STUCK)) + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); + return (true); } /* @@ -439,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_EVICT_WORKER *worker; uint64_t pages_evicted; - uint32_t flags; int loop; conn = S2C(session); @@ -462,25 +506,36 @@ __evict_pass(WT_SESSION_IMPL *session) } /* - * Increment the shared read generation. We do this - * occasionally even if eviction is not currently required, so - * that pages have some relative read generation when the - * eviction server does need to do some work. + * Increment the shared read generation. Do this occasionally + * even if eviction is not currently required, so that pages + * have some relative read generation when the eviction server + * does need to do some work. */ __wt_cache_read_gen_incr(session); - WT_RET(__evict_has_work(session, &flags)); - if (flags == 0) + /* + * Update the oldest ID: we use it to decide whether pages are + * candidates for eviction. Without this, if all threads are + * blocked after a long-running transaction (such as a + * checkpoint) completes, we may never start evicting again. + * + * Do this every time the eviction server wakes up, regardless + * of whether the cache is full, to prevent the oldest ID + * falling too far behind. + */ + __wt_txn_update_oldest(session, 1); + + if (!__evict_update_work(session)) break; if (loop > 10) - LF_SET(WT_EVICT_PASS_AGGRESSIVE); + FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE); /* * Start a worker if we have capacity and we haven't reached * the eviction targets. */ - if (LF_ISSET(WT_EVICT_PASS_ALL | + if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL | WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) && conn->evict_workers < conn->evict_workers_max) { WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, @@ -499,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session) " In use: %" PRIu64 " Dirty: %" PRIu64, conn->cache_size, cache->bytes_inmem, cache->bytes_dirty)); - WT_RET(__evict_lru_walk(session, flags)); + WT_RET(__evict_lru_walk(session)); WT_RET(__evict_server_work(session)); /* @@ -520,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session) * Mark the cache as stuck if we need space * and aren't evicting any pages. 
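			 * Being stuck feeds back into the next pass:
			 * __evict_update_work adds WT_EVICT_PASS_AGGRESSIVE
			 * when WT_CACHE_STUCK is set, and aggressive passes
			 * allow __evict_review to configure lookaside writes.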
*/ - if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) { + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_WOULD_BLOCK)) { F_SET(cache, WT_CACHE_STUCK); WT_STAT_FAST_CONN_INCR( session, cache_eviction_slow); @@ -546,9 +602,14 @@ static int __evict_clear_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; + WT_CACHE *cache; WT_REF *ref; btree = S2BT(session); + cache = S2C(session)->cache; + + if (session->dhandle == cache->evict_file_next) + cache->evict_file_next = NULL; if ((ref = btree->evict_ref) == NULL) return (0); @@ -568,21 +629,17 @@ __evict_clear_walk(WT_SESSION_IMPL *session) static int __evict_clear_walks(WT_SESSION_IMPL *session) { - WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *s; u_int i, session_cnt; conn = S2C(session); - cache = conn->cache; WT_ORDERED_READ(session_cnt, conn->session_cnt); for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) { if (!s->active || !F_ISSET(s, WT_SESSION_CLEAR_EVICT_WALK)) continue; - if (s->dhandle == cache->evict_file_next) - cache->evict_file_next = NULL; WT_WITH_DHANDLE( session, s->dhandle, WT_TRET(__evict_clear_walk(session))); } @@ -606,7 +663,8 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session) F_SET(session, WT_SESSION_CLEAR_EVICT_WALK); - while (btree->evict_ref != NULL && ret == 0) { + while (ret == 0 && (btree->evict_ref != NULL || + cache->evict_file_next == session->dhandle)) { F_SET(cache, WT_CACHE_CLEAR_WALKS); ret = __wt_cond_wait( session, cache->evict_waiter_cond, 100000); @@ -630,7 +688,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) conn = S2C(session); - SLIST_FOREACH(dhandle, &conn->dhlh, l) + TAILQ_FOREACH(dhandle, &conn->dhqh, q) if (WT_PREFIX_MATCH(dhandle->name, "file:")) WT_WITH_DHANDLE(session, dhandle, WT_TRET(__evict_clear_walk(session))); @@ -638,44 +696,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session) } /* - * __wt_evict_page -- - * Evict a given page. - */ -int -__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref) -{ - WT_DECL_RET; - WT_TXN *txn; - WT_TXN_ISOLATION saved_iso; - - /* - * We have to take care when evicting pages not to write a change that: - * (a) is not yet committed; or - * (b) is committed more recently than an in-progress checkpoint. - * - * We handle both of these cases by setting up the transaction context - * before evicting, using a special "eviction" isolation level, where - * only globally visible updates can be evicted. - */ - __wt_txn_update_oldest(session, 1); - txn = &session->txn; - saved_iso = txn->isolation; - txn->isolation = WT_ISO_EVICTION; - - /* - * Sanity check: if a transaction has updates, its updates should not - * be visible to eviction. - */ - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) || - !__wt_txn_visible(session, txn->id)); - - ret = __wt_evict(session, ref, 0); - txn->isolation = saved_iso; - - return (ret); -} - -/* * __wt_evict_file_exclusive_on -- * Get exclusive eviction access to a file and discard any of the file's * blocks queued for eviction. @@ -719,7 +739,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp) * clear it. */ elem = cache->evict_max; - for (i = 0, evict = cache->evict; i < elem; i++, evict++) + for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++) if (evict->btree == btree) __evict_list_clear(session, evict); __wt_spin_unlock(session, &cache->evict_lock); @@ -773,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server) * Add pages to the LRU queue to be evicted from cache. 
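 * Candidates are taken from the oldest quarter of the sorted queue, by
 * read generation: with generations of 100 and 200 at the two ends of
 * the queue, the cutoff is (3 * 100 + 200) / 4 = 125, and the candidate
 * count is clamped to between 10% and 50% of the entries.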
*/ static int -__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_lru_walk(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_DECL_RET; @@ -784,17 +804,17 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) cache = S2C(session)->cache; /* Get some more pages to consider for eviction. */ - if ((ret = __evict_walk(session, flags)) != 0) + if ((ret = __evict_walk(session)) != 0) return (ret == EBUSY ? 0 : ret); /* Sort the list into LRU order and restart. */ __wt_spin_lock(session, &cache->evict_lock); entries = cache->evict_entries; - qsort(cache->evict, + qsort(cache->evict_queue, entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp); - while (entries > 0 && cache->evict[entries - 1].ref == NULL) + while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL) --entries; cache->evict_entries = entries; @@ -811,12 +831,13 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) return (0); } - WT_ASSERT(session, cache->evict[0].ref != NULL); + WT_ASSERT(session, cache->evict_queue[0].ref != NULL); /* Track the oldest read generation we have in the queue. */ - cache->read_gen_oldest = cache->evict[0].ref->page->read_gen; + cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen; - if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) + if (FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) /* * Take all candidates if we only gathered pages with an oldest * read generation set. @@ -824,8 +845,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) cache->evict_candidates = entries; else { /* Find the bottom 25% of read generations. */ - cutoff = (3 * __evict_read_gen(&cache->evict[0]) + - __evict_read_gen(&cache->evict[entries - 1])) / 4; + cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) + + __evict_read_gen(&cache->evict_queue[entries - 1])) / 4; /* * Don't take less than 10% or more than 50% of entries, * regardless. That said, if there is only one entry, which is @@ -835,21 +856,21 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags) candidates < entries / 2; candidates++) if (__evict_read_gen( - &cache->evict[candidates]) > cutoff) + &cache->evict_queue[candidates]) > cutoff) break; cache->evict_candidates = candidates; } /* If we have more than the minimum number of entries, clear them. */ if (cache->evict_entries > WT_EVICT_WALK_BASE) { - for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i; + for (i = WT_EVICT_WALK_BASE, evict = cache->evict_queue + i; i < cache->evict_entries; i++, evict++) __evict_list_clear(session, evict); cache->evict_entries = WT_EVICT_WALK_BASE; } - cache->evict_current = cache->evict; + cache->evict_current = cache->evict_queue; __wt_spin_unlock(session, &cache->evict_lock); /* @@ -894,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session) * Fill in the array by walking the next set of pages. */ static int -__evict_walk(WT_SESSION_IMPL *session, uint32_t flags) +__evict_walk(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_CACHE *cache; @@ -910,14 +931,6 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags) incr = dhandle_locked = 0; retries = 0; - /* - * Update the oldest ID: we use it to decide whether pages are - * candidates for eviction. Without this, if all threads are blocked - * after a long-running transaction (such as a checkpoint) completes, - * we may never start evicting again. 
- */ - __wt_txn_update_oldest(session, 1); - if (cache->evict_current == NULL) WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty); else @@ -957,15 +970,24 @@ retry: while (slot < max_entries && ret == 0) { dhandle_locked = 1; } - if (dhandle == NULL) - dhandle = SLIST_FIRST(&conn->dhlh); - else { + if (dhandle == NULL) { + /* + * On entry, continue from wherever we got to in the + * scan last time through. If we don't have a saved + * handle, start from the beginning of the list. + */ + if ((dhandle = cache->evict_file_next) != NULL) + cache->evict_file_next = NULL; + else + dhandle = TAILQ_FIRST(&conn->dhqh); + } else { if (incr) { WT_ASSERT(session, dhandle->session_inuse > 0); - (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1); + (void)__wt_atomic_subi32( + &dhandle->session_inuse, 1); incr = 0; } - dhandle = SLIST_NEXT(dhandle, l); + dhandle = TAILQ_NEXT(dhandle, q); } /* If we reach the end of the list, we're done. */ @@ -977,15 +999,6 @@ retry: while (slot < max_entries && ret == 0) { !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - /* - * Each time we reenter this function, start at the next handle - * on the list. - */ - if (cache->evict_file_next != NULL && - cache->evict_file_next != dhandle) - continue; - cache->evict_file_next = NULL; - /* Skip files that don't allow eviction. */ btree = dhandle->handle; if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) @@ -996,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) { * stick in cache until we get aggressive. */ if ((btree->checkpointing || btree->evict_priority != 0) && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* Skip files if we have used all available hazard pointers. */ @@ -1015,7 +1028,7 @@ retry: while (slot < max_entries && ret == 0) { btree->evict_walk_skips = 0; prev_slot = slot; - (void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1); + (void)__wt_atomic_addi32(&dhandle->session_inuse, 1); incr = 1; __wt_spin_unlock(session, &conn->dhandle_lock); dhandle_locked = 0; @@ -1028,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) { */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) { WT_WITH_DHANDLE(session, dhandle, - ret = __evict_walk_file(session, &slot, flags)); + ret = __evict_walk_file(session, &slot)); WT_ASSERT(session, session->split_gen == 0); } @@ -1046,8 +1059,11 @@ retry: while (slot < max_entries && ret == 0) { } if (incr) { + /* Remember the file we should visit first, next loop. */ + cache->evict_file_next = dhandle; + WT_ASSERT(session, dhandle->session_inuse > 0); - (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1); + (void)__wt_atomic_subi32(&dhandle->session_inuse, 1); incr = 0; } @@ -1059,21 +1075,18 @@ retry: while (slot < max_entries && ret == 0) { /* * Walk the list of files a few times if we don't find enough pages. * Try two passes through all the files, give up when we have some - * candidates and we aren't finding more. Take care not to skip files - * on subsequent passes. + * candidates and we aren't finding more. */ if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 && slot < max_entries && (retries < 2 || - (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 && + (retries < 10 && + !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && (slot == cache->evict_entries || slot > start_slot)))) { - cache->evict_file_next = NULL; start_slot = slot; ++retries; goto retry; } - /* Remember the file we should visit first, next loop. 
*/ - cache->evict_file_next = dhandle; cache->evict_entries = slot; return (ret); } @@ -1092,7 +1105,7 @@ __evict_init_candidate( cache = S2C(session)->cache; /* Keep track of the maximum slot we are using. */ - slot = (u_int)(evict - cache->evict); + slot = (u_int)(evict - cache->evict_queue); if (slot >= cache->evict_max) cache->evict_max = slot + 1; @@ -1110,10 +1123,11 @@ __evict_init_candidate( * Get a few page eviction candidates from a single underlying file. */ static int -__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) +__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) { WT_BTREE *btree; WT_CACHE *cache; + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_EVICT_ENTRY *end, *evict, *start; WT_PAGE *page; @@ -1123,11 +1137,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) uint32_t walk_flags; int enough, internal_pages, modified, restarts; + conn = S2C(session); btree = S2BT(session); - cache = S2C(session)->cache; - start = cache->evict + *slotp; + cache = conn->cache; + start = cache->evict_queue + *slotp; end = WT_MIN(start + WT_EVICT_WALK_PER_FILE, - cache->evict + cache->evict_slots); + cache->evict_queue + cache->evict_slots); enough = internal_pages = restarts = 0; walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1178,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags) goto fast; /* Optionally ignore clean pages. */ - if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY)) + if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY)) continue; /* * If we are only trickling out pages marked for definite * eviction, skip anything that isn't marked. */ - if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && + if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) && page->read_gen != WT_READGEN_OLDEST) continue; /* Limit internal pages to 50% unless we get aggressive. */ if (WT_PAGE_IS_INTERNAL(page) && ++internal_pages > WT_EVICT_WALK_PER_FILE / 2 && - !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE)) + !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE)) continue; /* @@ -1207,36 +1222,44 @@ fast: /* If the page can't be evicted, give up. */ continue; /* - * If the page is clean but has modifications that appear too - * new to evict, skip it. + * Additional tests if eviction is likely to succeed. * - * Note: take care with ordering: if we detected that the page - * is modified above, we expect mod != NULL. + * If eviction is stuck or we are helping with forced eviction, + * try anyway: maybe a transaction that was running last time + * we wrote the page has since rolled back, or we can help the + * checkpoint complete sooner. Additionally, being stuck will + * configure lookaside table writes in reconciliation, allowing + * us to evict pages we can't usually evict. */ - mod = page->modify; - if (!modified && mod != NULL && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - !__wt_txn_visible_all(session, mod->rec_max_txn)) - continue; + if (!FLD_ISSET(cache->state, + WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) { + /* + * Note: take care with ordering: if we detected that + * the page is modified above, we expect mod != NULL. + */ + mod = page->modify; - /* - * If the oldest transaction hasn't changed since the last time - * this page was written, it's unlikely that we can make - * progress. Similarly, if the most recent update on the page - * is not yet globally visible, eviction will fail. These - * heuristics attempt to avoid repeated attempts to evict the - * same page. 
- * - * That said, if eviction is stuck, or we are helping with - * forced eviction, try anyway: maybe a transaction that was - * running last time we wrote the page has since rolled back, - * or we can help get the checkpoint completed sooner. - */ - if (modified && !LF_ISSET( - WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) && - (mod->disk_snap_min == S2C(session)->txn_global.oldest_id || - !__wt_txn_visible_all(session, mod->update_txn))) - continue; + /* + * If the page is clean but has modifications that + * appear too new to evict, skip it. + */ + if (!modified && mod != NULL && + !__wt_txn_visible_all(session, mod->rec_max_txn)) + continue; + + /* + * If the oldest transaction hasn't changed since the + * last time this page was written, it's unlikely we + * can make progress. Similarly, if the most recent + * update on the page is not yet globally visible, + * eviction will fail. These heuristics attempt to + * avoid repeated attempts to evict the same page. + */ + if (modified && + (mod->disk_snap_min == conn->txn_global.oldest_id || + !__wt_txn_visible_all(session, mod->update_txn))) + continue; + } WT_ASSERT(session, evict->ref == NULL); __evict_init_candidate(session, evict, ref); @@ -1245,28 +1268,28 @@ fast: /* If the page can't be evicted, give up. */ WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER, "select: %p, size %" PRIu64, page, page->memory_footprint)); } + WT_RET_NOTFOUND_OK(ret); + + *slotp += (u_int)(evict - start); /* * If we happen to end up on the root page, clear it. We have to track * hazard pointers, and the root page complicates that calculation. * - * Also clear the walk if we land on a page requiring forced eviction. - * The eviction server may go to sleep, and we want this page evicted - * as quickly as possible. + * If we land on a page requiring forced eviction, move on to the next + * page: we want this page evicted as quickly as possible. */ - if ((ref = btree->evict_ref) != NULL && (__wt_ref_is_root(ref) || - ref->page->read_gen == WT_READGEN_OLDEST)) { - btree->evict_ref = NULL; - __wt_page_release(session, ref, WT_READ_NO_EVICT); + if ((ref = btree->evict_ref) != NULL) { + if (__wt_ref_is_root(ref)) + WT_RET(__evict_clear_walk(session)); + else if (ref->page->read_gen == WT_READGEN_OLDEST) + WT_RET_NOTFOUND_OK(__wt_tree_walk(session, + &btree->evict_ref, &pages_walked, walk_flags)); } - /* If the walk was interrupted by a locked page, that's okay. */ - if (ret == WT_NOTFOUND) - ret = 0; - - *slotp += (u_int)(evict - start); WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked); - return (ret); + + return (0); } /* @@ -1310,7 +1333,7 @@ __evict_get_ref( /* Get the next page queued for eviction. */ while ((evict = cache->evict_current) != NULL && - evict < cache->evict + candidates && evict->ref != NULL) { + evict < cache->evict_queue + candidates && evict->ref != NULL) { WT_ASSERT(session, evict->btree != NULL); /* Move to the next item. */ @@ -1321,8 +1344,8 @@ __evict_get_ref( * multiple attempts to evict it. For pages that are already * being evicted, this operation will fail and we will move on. */ - if (!WT_ATOMIC_CAS4( - evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { + if (!__wt_atomic_casv32( + &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) { __evict_list_clear(session, evict); continue; } @@ -1331,7 +1354,7 @@ __evict_get_ref( * Increment the busy count in the btree handle to prevent it * from being closed under us. 
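		 * The count brackets the eviction attempt:
		 *
		 *	(void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);
		 *	... attempt to evict the page ...
		 *	(void)__wt_atomic_subv32(&btree->evict_busy, 1);
		 *
		 * with the decrement in __evict_page, after __wt_evict returns.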
*/ - (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1); + (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1); *btreep = evict->btree; *refp = evict->ref; @@ -1345,7 +1368,7 @@ __evict_get_ref( } /* Clear the current pointer if there are no more candidates. */ - if (evict >= cache->evict + cache->evict_candidates) + if (evict >= cache->evict_queue + cache->evict_candidates) cache->evict_current = NULL; __wt_spin_unlock(session, &cache->evict_lock); @@ -1402,15 +1425,12 @@ __evict_page(WT_SESSION_IMPL *session, int is_server) * page-discard function assert that no dirty pages are ever * discarded. */ - if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) && - __wt_page_is_modified(page)) { - page->modify->write_gen = 0; - __wt_cache_dirty_decr(session, page); - } + if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD)) + __wt_page_modify_clear(session, page); - WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref)); + WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0)); - (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); + (void)__wt_atomic_subv32(&btree->evict_busy, 1); WT_RET(ret); @@ -1427,7 +1447,7 @@ __evict_page(WT_SESSION_IMPL *session, int is_server) * crosses its boundaries. */ int -__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) +__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; @@ -1544,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full) * NOTE: this function is not called anywhere, it is intended to be called * from a debugger. */ -void -__wt_cache_dump(WT_SESSION_IMPL *session) +int +__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile) { - WT_BTREE *btree; + FILE *fp; WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_REF *next_walk; + WT_DATA_HANDLE *dhandle, *saved_dhandle; WT_PAGE *page; + WT_REF *next_walk; uint64_t file_intl_pages, file_leaf_pages; uint64_t file_bytes, file_dirty, total_bytes; conn = S2C(session); total_bytes = 0; - SLIST_FOREACH(dhandle, &conn->dhlh, l) { + if (ofile == NULL) + fp = stdout; + else + WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp)); + + saved_dhandle = session->dhandle; + TAILQ_FOREACH(dhandle, &conn->dhqh, q) { if (!WT_PREFIX_MATCH(dhandle->name, "file:") || !F_ISSET(dhandle, WT_DHANDLE_OPEN)) continue; - btree = dhandle->handle; - if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) - continue; - file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0; next_walk = NULL; session->dhandle = dhandle; @@ -1581,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session) file_bytes += page->memory_footprint; if (__wt_page_is_modified(page)) file_dirty += page->memory_footprint; + (void)__wt_fprintf(fp, + "%" WT_SIZET_FMT ", ", page->memory_footprint); } session->dhandle = NULL; - printf("cache dump: %s%s%s%s:" - " %" PRIu64 " intl pages, %" PRIu64 " leaf pages," - " %" PRIu64 "MB, %" PRIu64 "MB dirty\n", + (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t" + " %" PRIu64 " internal pages, %" PRIu64 " leaf pages," + " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n", dhandle->name, dhandle->checkpoint == NULL ? "" : " [", dhandle->checkpoint == NULL ? 
"" : dhandle->checkpoint, @@ -1596,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session) total_bytes += file_bytes; } - printf("cache dump: total found = %" PRIu64 "MB" + session->dhandle = saved_dhandle; + + (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB" " vs tracked inuse %" PRIu64 "MB\n", total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20); - fflush(stdout); + if (fp != stdout) + WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE)); + return (0); } #endif diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index 1e5faf45de2..11284ce7b21 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) && int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) { - int evict; - /* * If doing normal system eviction, but only in the service of reducing * the number of dirty pages, leave the clean page in cache. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* * Discard the page and update the reference structure; if the page has @@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) WT_ADDR *addr; WT_PAGE *parent; WT_PAGE_MODIFY *mod; - int evict; parent = ref->home; mod = ref->page->modify; @@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * push it out of cache (and read it back in, when needed), we * would rather have more, smaller pages than fewer large pages. */ - if (!closing) { - __wt_cache_status(session, &evict, NULL); - if (!evict) - return (EBUSY); - } + if (!closing && __wt_eviction_dirty_target(session)) + return (EBUSY); /* Discard the parent's address. */ if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) { @@ -309,8 +300,7 @@ __evict_review( { WT_DECL_RET; WT_PAGE *page; - WT_PAGE_MODIFY *mod; - uint32_t reconcile_flags; + uint32_t flags; /* * Get exclusive access to the page if our caller doesn't have the tree @@ -331,7 +321,6 @@ __evict_review( /* Now that we have exclusive access, review the page. */ page = ref->page; - mod = page->modify; /* * Fail if an internal has active children, the children must be evicted @@ -347,6 +336,13 @@ __evict_review( /* Check if the page can be evicted. */ if (!closing) { + /* + * Update the oldest ID to avoid wasted effort should it have + * fallen behind current. + */ + if (__wt_page_is_modified(page)) + __wt_txn_update_oldest(session, 1); + if (!__wt_page_can_evict(session, page, 0, inmem_splitp)) return (EBUSY); @@ -361,9 +357,12 @@ __evict_review( return (__wt_split_insert(session, ref)); } + /* If the page is clean, we're done and we can evict. */ + if (!__wt_page_is_modified(page)) + return (0); + /* - * If the page is dirty and can possibly change state, reconcile it to - * determine the final state. + * If the page is dirty, reconcile it to decide if we can evict it. * * If we have an exclusive lock (we're discarding the tree), assert * there are no updates we cannot read. @@ -377,30 +376,38 @@ __evict_review( * in-memory pages, (restoring the updates that stopped us from writing * the block), and inserting the whole mess into the page's parent. * - * Don't set the update-restore flag for internal pages, they don't have - * updates that can be saved and restored. 
+	 * Otherwise, if eviction is getting pressed, configure reconciliation
+	 * to write not-yet-globally-visible updates to the lookaside table,
+	 * allowing the eviction of pages we'd otherwise have to retain in
+	 * cache to support older readers.
+	 *
+	 * Don't set the update-restore or lookaside table flags for internal
+	 * pages; they don't have update lists that can be saved and restored.
 	 */
-	reconcile_flags = WT_EVICTING;
-	if (__wt_page_is_modified(page)) {
-		if (closing)
-			FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR);
-		else if (!WT_PAGE_IS_INTERNAL(page) &&
-		    page->read_gen == WT_READGEN_OLDEST)
-			FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE);
-		WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags));
-		WT_ASSERT(session,
-		    !__wt_page_is_modified(page) ||
-		    FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE));
+	flags = WT_EVICTING;
+	if (closing)
+		LF_SET(WT_VISIBILITY_ERR);
+	else if (!WT_PAGE_IS_INTERNAL(page)) {
+		if (page->read_gen == WT_READGEN_OLDEST)
+			LF_SET(WT_EVICT_UPDATE_RESTORE);
+		else if (__wt_eviction_aggressive(session))
+			LF_SET(WT_EVICT_LOOKASIDE);
 	}
+	WT_RET(__wt_reconcile(session, ref, NULL, flags));
+
 	/*
-	 * If the page was ever modified, make sure all of the updates
-	 * on the page are old enough they can be discarded from cache.
+	 * Success: assert the page is clean or reconciliation was configured
+	 * for an update/restore split, and if the page is clean,
+	 * reconciliation was configured for a lookaside table or all updates
+	 * on the page are globally visible.
 	 */
-	if (!closing && mod != NULL &&
-	    !__wt_txn_visible_all(session, mod->rec_max_txn) &&
-	    !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE))
-		return (EBUSY);
+	WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+	WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_LOOKASIDE) ||
+	    __wt_page_is_modified(page) ||
+	    __wt_txn_visible_all(session, page->modify->rec_max_txn));

 	return (0);
 }
diff --git a/src/include/async.h b/src/include/async.h
index 88ecad6eb2c..fb9a64e774d 100644
--- a/src/include/async.h
+++ b/src/include/async.h
@@ -6,20 +6,6 @@
  * See the file LICENSE for redistribution information.
  */

-typedef enum {
-	WT_ASYNCOP_ENQUEUED,		/* Placed on the work queue */
-	WT_ASYNCOP_FREE,		/* Able to be allocated to user */
-	WT_ASYNCOP_READY,		/* Allocated and ready for user to use */
-	WT_ASYNCOP_WORKING		/* Operation in progress by worker */
-} WT_ASYNC_STATE;
-
-typedef enum {
-	WT_ASYNC_FLUSH_NONE=0,		/* No flush in progress */
-	WT_ASYNC_FLUSH_COMPLETE,	/* Notify flush caller it's done */
-	WT_ASYNC_FLUSH_IN_PROGRESS,	/* Prevent other callers */
-	WT_ASYNC_FLUSHING		/* Notify workers */
-} WT_ASYNC_FLUSH_STATE;
-
 #define	MAX_ASYNC_SLEEP_USECS	100000	/* Maximum sleep waiting for work */
 #define	MAX_ASYNC_YIELD		200	/* Maximum number of yields for work */

@@ -31,7 +17,7 @@ typedef enum {
  * The URI/config/format cache.
  */
 struct __wt_async_format {
-	STAILQ_ENTRY(__wt_async_format) q;
+	TAILQ_ENTRY(__wt_async_format) q;
 	const char *config;
 	uint64_t cfg_hash;		/* Config hash */
 	const char *uri;
@@ -53,7 +39,13 @@ struct __wt_async_op_impl {
 	uint64_t unique_id;	/* Unique identifier.
*/ WT_ASYNC_FORMAT *format; /* Format structure */ - WT_ASYNC_STATE state; /* Op state */ + +#define WT_ASYNCOP_ENQUEUED 0 /* Placed on the work queue */ +#define WT_ASYNCOP_FREE 1 /* Able to be allocated to user */ +#define WT_ASYNCOP_READY 2 /* Allocated, ready for user to use */ +#define WT_ASYNCOP_WORKING 3 /* Operation in progress by worker */ + uint32_t state; + WT_ASYNC_OPTYPE optype; /* Operation type */ }; @@ -88,10 +80,16 @@ struct __wt_async { uint64_t alloc_tail; /* Next slot to dequeue */ uint64_t tail_slot; /* Worker slot consumed */ - STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh; - int cur_queue; /* Currently enqueued */ - int max_queue; /* Maximum enqueued */ - WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */ + TAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh; + uint32_t cur_queue; /* Currently enqueued */ + uint32_t max_queue; /* Maximum enqueued */ + +#define WT_ASYNC_FLUSH_NONE 0 /* No flush in progress */ +#define WT_ASYNC_FLUSH_COMPLETE 1 /* Notify flush caller done */ +#define WT_ASYNC_FLUSH_IN_PROGRESS 2 /* Prevent other callers */ +#define WT_ASYNC_FLUSHING 3 /* Notify workers */ + uint32_t flush_state; + /* Notify any waiting threads when flushing is done. */ WT_CONDVAR *flush_cond; WT_ASYNC_OP_IMPL flush_op; /* Special flush op */ @@ -112,7 +110,7 @@ struct __wt_async { * has a cache of async cursors to reuse for operations. */ struct __wt_async_cursor { - STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */ + TAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */ uint64_t cfg_hash; /* Config hash */ uint64_t uri_hash; /* URI hash */ WT_CURSOR *c; /* WT cursor */ @@ -124,6 +122,6 @@ struct __wt_async_cursor { */ struct __wt_async_worker_state { uint32_t id; - STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh; + TAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh; uint32_t num_cursors; }; diff --git a/src/include/bitstring.i b/src/include/bitstring.i index c548c12761d..5449ffe6209 100644 --- a/src/include/bitstring.i +++ b/src/include/bitstring.i @@ -84,10 +84,10 @@ __bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp) * __bit_test -- * Test one bit in name. */ -static inline int +static inline bool __bit_test(uint8_t *bitf, uint64_t bit) { - return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0); + return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0); } /* diff --git a/src/include/block.h b/src/include/block.h index 795d646db1e..ce33b331e76 100644 --- a/src/include/block.h +++ b/src/include/block.h @@ -215,8 +215,8 @@ struct __wt_block { /* A list of block manager handles, sharing a file descriptor. */ uint32_t ref; /* References */ WT_FH *fh; /* Backing file handle */ - SLIST_ENTRY(__wt_block) l; /* Linked list of handles */ - SLIST_ENTRY(__wt_block) hashl; /* Hashed list of handles */ + TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */ + TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */ /* Configuration information, set when the file is opened. */ uint32_t allocfirst; /* Allocation is first-fit */ diff --git a/src/include/btmem.h b/src/include/btmem.h index f13504d66ca..f214ddb1dc3 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -6,6 +6,8 @@ * See the file LICENSE for redistribution information. 
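For reference on the __bit_test() conversion above: the bool return makes the intent explicit without changing the computation. The helper macros it relies on follow the usual bitstring layout, shown here as assumed definitions with a worked example:

	#define __bit_byte(bit) ((bit) >> 3)	     /* which byte: bit / 8 */
	#define __bit_mask(bit) (1 << ((bit) & 0x7)) /* which bit in that byte */

	uint8_t bitf[] = { 0x05 };	/* bits 0 and 2 set */
	/* __bit_test(bitf, 2) is true, __bit_test(bitf, 1) is false. */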
*/ +#define WT_RECNO_OOB 0 /* Illegal record number */ + /* * WT_PAGE_HEADER -- * Blocks have a common header, a WT_PAGE_HEADER structure followed by a @@ -43,6 +45,7 @@ struct __wt_page_header { #define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */ #define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */ #define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */ +#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */ uint8_t flags; /* 25: flags */ /* @@ -168,6 +171,29 @@ struct __wt_ovfl_txnc { }; /* + * Lookaside table support: when a page is being reconciled for eviction and has + * updates that might be required by earlier readers in the system, the updates + * are written into a lookaside table, and restored as necessary if the page is + * read. The key is a unique marker for the page (a file ID plus an address), + * a counter (used to ensure the update records remain in the original order), + * the on-page item's transaction ID (so we can discard any update records from + * the lookaside table once the on-page item's transaction is globally visible), + * and the page key (byte-string for row-store, record number for column-store). + * The value is the WT_UPDATE structure's transaction ID, update size and value. + * + * As the key for the lookaside table is different for row- and column-store, we + * store both key types in a WT_ITEM, building/parsing them in the code, because + * otherwise we'd need two lookaside files with different key formats. We could + * make the lookaside table's key standard by moving the source key into the + * lookaside table value, but that doesn't make the coding any simpler, and it + * makes the lookaside table's value more likely to overflow the page size when + * the row-store key is relatively large. + */ +#define WT_LAS_FORMAT \ + "key_format=" WT_UNCHECKED_STRING(IuQQu) \ + ",value_format=" WT_UNCHECKED_STRING(QIu) + +/* * WT_PAGE_MODIFY -- * When a page is modified, there's additional information to maintain. */ @@ -238,15 +264,17 @@ struct __wt_page_modify { * Eviction, but block wasn't written: unresolved updates and * associated disk image. * - * Skipped updates are either a WT_INSERT, or a row-store leaf - * page entry. + * Saved updates are either a WT_INSERT, or a row-store leaf + * page entry; in the case of creating lookaside records, there + * is an additional value, the committed item's transaction ID. */ - struct __wt_upd_skipped { + struct __wt_save_upd { WT_INSERT *ins; WT_ROW *rip; - } *skip; - uint32_t skip_entries; - void *skip_dsk; + uint64_t onpage_txn; + } *supd; + uint32_t supd_entries; + void *supd_dsk; /* * Block was written: address, size and checksum. @@ -556,9 +584,8 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */ +#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */ #define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ -#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -656,14 +683,6 @@ struct __wt_page { * to the readers. If the evicting thread does not find a hazard pointer, * the page is evicted. 
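To make WT_LAS_FORMAT concrete: a lookaside cursor sets the five key columns and three value columns in the order the format strings above specify. A minimal sketch of writing one record; every name here is hypothetical, only the column order comes from the formats:

	static int
	las_insert_one(WT_CURSOR *cursor, uint32_t btree_id, WT_ITEM *las_addr,
	    uint64_t counter, uint64_t onpage_txn, WT_ITEM *las_key,
	    uint64_t upd_txn, uint32_t upd_size, WT_ITEM *upd_value)
	{
		/* Key (IuQQu): file ID, page address, counter, txn ID, key. */
		cursor->set_key(cursor,
		    btree_id, las_addr, counter, onpage_txn, las_key);
		/* Value (QIu): update txn ID, update size, update value. */
		cursor->set_value(cursor, upd_txn, upd_size, upd_value);
		return (cursor->insert(cursor));
	}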
*/ -typedef enum __wt_page_state { - WT_REF_DISK=0, /* Page is on disk */ - WT_REF_DELETED, /* Page is on disk, but deleted */ - WT_REF_LOCKED, /* Page locked for exclusive access */ - WT_REF_MEM, /* Page is in cache and valid */ - WT_REF_READING, /* Page being read */ - WT_REF_SPLIT /* Parent page split (WT_REF dead) */ -} WT_PAGE_STATE; /* * WT_PAGE_DELETED -- @@ -691,7 +710,13 @@ struct __wt_ref { WT_PAGE * volatile home; /* Reference page */ uint32_t pindex_hint; /* Reference page index hint */ - volatile WT_PAGE_STATE state; /* Page state */ +#define WT_REF_DISK 0 /* Page is on disk */ +#define WT_REF_DELETED 1 /* Page is on disk, but deleted */ +#define WT_REF_LOCKED 2 /* Page locked for exclusive access */ +#define WT_REF_MEM 3 /* Page is in cache and valid */ +#define WT_REF_READING 4 /* Page being read */ +#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */ + volatile uint32_t state; /* Page state */ /* * Address: on-page cell if read from backing block, off-page WT_ADDR @@ -871,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update) * store 4GB objects; I'd rather do that than increase the size of this * structure for a flag bit. */ -#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX) -#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX) +#define WT_UPDATE_DELETED_VALUE UINT32_MAX +#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE) +#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE) uint32_t size; /* update length */ /* The untyped value immediately follows the WT_UPDATE structure. */ @@ -958,7 +984,7 @@ struct __wt_insert { #define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \ if (((v) = (dest)) == NULL) { \ WT_ERR(__wt_calloc_def(s, count, &(v))); \ - if (WT_ATOMIC_CAS8(dest, NULL, v)) \ + if (__wt_atomic_cas_ptr(&dest, NULL, v)) \ __wt_cache_page_inmem_incr( \ s, page, (count) * sizeof(*(v))); \ else \ diff --git a/src/include/btree.h b/src/include/btree.h index deecd8f6d88..98ce4c22c10 100644 --- a/src/include/btree.h +++ b/src/include/btree.h @@ -146,12 +146,14 @@ struct __wt_btree { /* Flags values up to 0xff are reserved for WT_DHANDLE_* */ #define WT_BTREE_BULK 0x00100 /* Bulk-load handle */ #define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */ -#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */ -#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */ -#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */ -#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */ -#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */ -#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */ +#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */ +#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */ +#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */ +#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */ +#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */ +#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */ +#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */ +#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */ uint32_t flags; }; diff --git a/src/include/btree.i b/src/include/btree.i index d13ec1972fb..b54cecb6ce0 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -10,17 +10,17 @@ * __wt_ref_is_root -- * Return if the page reference is for the root page. */ -static inline int +static inline bool __wt_ref_is_root(WT_REF *ref) { - return (ref->home == NULL ? 
1 : 0); + return (ref->home == NULL); } /* * __wt_page_is_empty -- * Return if the page is empty. */ -static inline int +static inline bool __wt_page_is_empty(WT_PAGE *page) { return (page->modify != NULL && @@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page) * __wt_page_is_modified -- * Return if the page is dirty. */ -static inline int +static inline bool __wt_page_is_modified(WT_PAGE *page) { - return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0); + return (page->modify != NULL && page->modify->write_gen != 0); } /* @@ -49,46 +49,74 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) WT_ASSERT(session, size < WT_EXABYTE); cache = S2C(session)->cache; - (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size); - (void)WT_ATOMIC_ADD8(page->memory_footprint, size); + (void)__wt_atomic_add64(&cache->bytes_inmem, size); + (void)__wt_atomic_addsize(&page->memory_footprint, size); if (__wt_page_is_modified(page)) { - (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size); - (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size); + (void)__wt_atomic_add64(&cache->bytes_dirty, size); + (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); } /* Track internal and overflow size in cache. */ if (WT_PAGE_IS_INTERNAL(page)) - (void)WT_ATOMIC_ADD8(cache->bytes_internal, size); + (void)__wt_atomic_add64(&cache->bytes_internal, size); else if (page->type == WT_PAGE_OVFL) - (void)WT_ATOMIC_ADD8(cache->bytes_overflow, size); + (void)__wt_atomic_add64(&cache->bytes_overflow, size); } -/* - * WT_CACHE_DECR -- - * Macro to decrement a field by a size. - * - * Be defensive and don't underflow: a band-aid on a gaping wound, but underflow - * won't make things better no matter the problem (specifically, underflow makes - * eviction crazy trying to evict non-existent memory). +/* + * __wt_cache_decr_check_size -- + * Decrement a size_t cache value and check for underflow. */ +static inline void +__wt_cache_decr_check_size( + WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld) +{ + if (__wt_atomic_subsize(vp, v) < WT_EXABYTE) + return; + #ifdef HAVE_DIAGNOSTIC -#define WT_CACHE_DECR(session, f, sz) do { \ - static int __first = 1; \ - if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) { \ - (void)WT_ATOMIC_ADD8(f, sz); \ - if (__first) { \ - __wt_errx(session, \ - "%s underflow: decrementing %" WT_SIZET_FMT,\ - #f, sz); \ - __first = 0; \ - } \ - } \ -} while (0) + (void)__wt_atomic_addsize(vp, v); + + { + static int first = 1; + + if (!first) + return; + __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); + first = 0; + } #else -#define WT_CACHE_DECR(s, f, sz) do { \ - if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) \ - (void)WT_ATOMIC_ADD8(f, sz); \ -} while (0) + WT_UNUSED(fld); + WT_UNUSED(session); #endif +} + +/* + * __wt_cache_decr_check_uint64 -- + * Decrement a uint64_t cache value and check for underflow. 
+ */ +static inline void +__wt_cache_decr_check_uint64( + WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld) +{ + if (__wt_atomic_sub64(vp, v) < WT_EXABYTE) + return; + +#ifdef HAVE_DIAGNOSTIC + (void)__wt_atomic_add64(vp, v); + + { + static int first = 1; + + if (!first) + return; + __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v); + first = 0; + } +#else + WT_UNUSED(fld); + WT_UNUSED(session); +#endif +} /* * __wt_cache_page_byte_dirty_decr -- @@ -128,9 +156,10 @@ __wt_cache_page_byte_dirty_decr( */ orig = page->modify->bytes_dirty; decr = WT_MIN(size, orig); - if (WT_ATOMIC_CAS8( - page->modify->bytes_dirty, orig, orig - decr)) { - WT_CACHE_DECR(session, cache->bytes_dirty, decr); + if (__wt_atomic_cassize( + &page->modify->bytes_dirty, orig, orig - decr)) { + __wt_cache_decr_check_uint64(session, + &cache->bytes_dirty, decr, "WT_CACHE.bytes_dirty"); break; } } @@ -149,15 +178,19 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size) WT_ASSERT(session, size < WT_EXABYTE); - WT_CACHE_DECR(session, cache->bytes_inmem, size); - WT_CACHE_DECR(session, page->memory_footprint, size); + __wt_cache_decr_check_uint64( + session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem"); + __wt_cache_decr_check_size( + session, &page->memory_footprint, size, "WT_PAGE.memory_footprint"); if (__wt_page_is_modified(page)) __wt_cache_page_byte_dirty_decr(session, page, size); /* Track internal and overflow size in cache. */ if (WT_PAGE_IS_INTERNAL(page)) - WT_CACHE_DECR(session, cache->bytes_internal, size); + __wt_cache_decr_check_uint64(session, + &cache->bytes_internal, size, "WT_CACHE.bytes_internal"); else if (page->type == WT_PAGE_OVFL) - WT_CACHE_DECR(session, cache->bytes_overflow, size); + __wt_cache_decr_check_uint64(session, + &cache->bytes_overflow, size, "WT_CACHE.bytes_overflow"); } /* @@ -172,15 +205,15 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page) size_t size; cache = S2C(session)->cache; - (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1); + (void)__wt_atomic_add64(&cache->pages_dirty, 1); /* * Take care to read the memory_footprint once in case we are racing * with updates. */ size = page->memory_footprint; - (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size); - (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size); + (void)__wt_atomic_add64(&cache->bytes_dirty, size); + (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size); } /* @@ -202,7 +235,7 @@ __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page) "count went negative"); cache->pages_dirty = 0; } else - (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1); + (void)__wt_atomic_sub64(&cache->pages_dirty, 1); modify = page->modify; if (modify != NULL && modify->bytes_dirty != 0) @@ -224,12 +257,15 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) modify = page->modify; /* Update the bytes in-memory to reflect the eviction. */ - WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint); + __wt_cache_decr_check_uint64(session, + &cache->bytes_inmem, + page->memory_footprint, "WT_CACHE.bytes_inmem"); /* Update the bytes_internal value to reflect the eviction */ if (WT_PAGE_IS_INTERNAL(page)) - WT_CACHE_DECR(session, - cache->bytes_internal, page->memory_footprint); + __wt_cache_decr_check_uint64(session, + &cache->bytes_internal, + page->memory_footprint, "WT_CACHE.bytes_internal"); /* Update the cache's dirty-byte count. 
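The WT_EXABYTE test in both helpers above works because the counters are unsigned: an over-decrement wraps to a value near 2^64, far beyond any real cache size (WT_EXABYTE is 2^60 in this tree). Worked example:

	uint64_t bytes = 10;

	bytes -= 100;	/* wraps to 2^64 - 90, which is > WT_EXABYTE */
	/* The helper re-adds the value and reports the underflow once. */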
*/ if (modify != NULL && modify->bytes_dirty != 0) { @@ -239,13 +275,14 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page) "dirty byte count went negative"); cache->bytes_dirty = 0; } else - WT_CACHE_DECR( - session, cache->bytes_dirty, modify->bytes_dirty); + __wt_cache_decr_check_uint64(session, + &cache->bytes_dirty, + modify->bytes_dirty, "WT_CACHE.bytes_dirty"); } /* Update pages and bytes evicted. */ - (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint); - (void)WT_ATOMIC_ADD8(cache->pages_evict, 1); + (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint); + (void)__wt_atomic_add64(&cache->pages_evict, 1); } /* @@ -306,7 +343,7 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * Every time the page transitions from clean to dirty, update the cache * and transactional information. */ - if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) { + if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) { __wt_cache_dirty_incr(session, page); /* @@ -321,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * have committed in the meantime, and the last_running field * been updated past it. That is all very unlikely, but not * impossible, so we take care to read the global state before - * the atomic increment. If we raced with reconciliation, just - * leave the previous value here: at worst, we will write a - * page in a checkpoint when not absolutely necessary. + * the atomic increment. + * + * If the page was dirty on entry, then last_running == 0. The + * page could have become clean since then, if reconciliation + * completed. In that case, we leave the previous value for + * first_dirty_txn rather than potentially racing to update it, + * at worst, we'll unnecessarily write a page in a checkpoint. */ if (last_running != 0) page->modify->first_dirty_txn = last_running; @@ -335,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) } /* + * __wt_page_modify_clear -- + * Clean a modified page. + */ +static inline void +__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + /* + * The page must be held exclusive when this call is made, this call + * can only be used when the page is owned by a single thread. + * + * Allow the call to be made on clean pages. + */ + if (__wt_page_is_modified(page)) { + page->modify->write_gen = 0; + __wt_cache_dirty_decr(session, page); + } +} + +/* * __wt_page_modify_set -- * Mark the page and tree dirty. */ @@ -354,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page) * shouldn't cause problems; regardless, let's play it safe.) */ if (S2BT(session)->modified == 0) { + /* Assert we never dirty a checkpoint handle. */ + WT_ASSERT(session, session->dhandle->checkpoint == NULL); + S2BT(session)->modified = 1; WT_FULL_BARRIER(); } @@ -395,7 +458,7 @@ __wt_page_parent_modify_set( * __wt_off_page -- * Return if a pointer references off-page data. */ -static inline int +static inline bool __wt_off_page(WT_PAGE *page, const void *p) { /* @@ -496,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref) static inline void __wt_ref_key_clear(WT_REF *ref) { - /* The key union has 2 fields, both of which are 8B. */ + /* + * The key union has 2 8B fields; this is equivalent to: + * + * ref->key.recno = WT_RECNO_OOB; + * ref->key.ikey = NULL; + */ ref->key.recno = 0; } @@ -506,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref) * had without unpacking a cell, and information about the cell, if the key * isn't cheaply available. 
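A usage note for __wt_page_modify_clear() above: a page is dirty exactly when page->modify->write_gen is nonzero, so the set/clear pair brackets a page's dirty lifetime and keeps the cache's dirty counts balanced. The pairing, with exclusive access assumed for the clear:

	__wt_page_modify_set(session, page);	/* write_gen 0 -> 1: dirty */
	/* ... apply updates, reconciliation eventually writes the page ... */
	__wt_page_modify_clear(session, page);	/* write_gen -> 0: clean */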
*/ -static inline int +static inline bool __wt_row_leaf_key_info(WT_PAGE *page, void *copy, WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep) { @@ -597,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (cellp != NULL) *cellp = WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v)); - return (0); + return (false); case WT_K_FLAG: /* Encoded key: no instantiated key, no cell. */ if (cellp != NULL) @@ -608,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v)); *sizep = WT_K_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); case WT_KV_FLAG: /* Encoded key/value pair: no instantiated key, no cell. */ if (cellp != NULL) @@ -621,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, *(void **)datap = WT_PAGE_REF_OFFSET( page, WT_KV_DECODE_KEY_OFFSET(v)); *sizep = WT_KV_DECODE_KEY_LEN(v); - return (1); + return (true); } - return (0); + return (false); } @@ -636,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy, if (datap != NULL) { *(void **)datap = WT_IKEY_DATA(ikey); *sizep = ikey->size; - return (1); + return (true); } - return (0); + return (false); } /* @@ -826,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack) * __wt_row_leaf_value -- * Return the value for a row-store leaf page encoded key/value pair. */ -static inline int +static inline bool __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) { uintptr_t v; @@ -842,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value) value->data = WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v)); value->size = WT_KV_DECODE_VALUE_LEN(v); - return (1); + return (true); } - return (0); + return (false); } /* @@ -903,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session, * __wt_page_can_split -- * Check whether a page can be split in memory. */ -static inline int +static inline bool __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_INSERT_HEAD *ins_head; + WT_INSERT *ins; + int i; btree = S2BT(session); @@ -916,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) * of the page could continually split without benefit. */ if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) - return (0); + return (false); /* * Check for pages with append-only workloads. A common application * pattern is to have multiple threads frantically appending to the * tree. We want to reconcile and evict this page, but we'd like to - * do it without making the appending threads wait. If we're not - * discarding the tree, check and see if it's worth doing a split to - * let the threads continue before doing eviction. - * - * Ignore anything other than large, dirty row-store leaf pages. + * do it without making the appending threads wait. See if it's worth + * doing a split to let the threads continue before doing eviction. * - * XXX KEITH - * Need a better test for append-only workloads. + * Ignore anything other than large, dirty row-store leaf pages. The + * split code only supports row-store pages, and we depend on the page + * being dirty for correctness (the page must be reconciled again + * before being evicted after the split, information from a previous + * reconciliation will be wrong, so we can't evict immediately). 
*/ if (page->type != WT_PAGE_ROW_LEAF || page->memory_footprint < btree->maxmempage || !__wt_page_is_modified(page)) - return (0); - - /* Don't split a page that is pending a multi-block split. */ - if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK)) - return (0); + return (false); /* * There is no point splitting if the list is small, no deep items is - * our heuristic for that. (A 1/4 probability of adding a new skiplist - * level means there will be a new 6th level for roughly each 4KB of - * entries in the list. If we have at least two 6th level entries, the - * list is at least large enough to work with.) - * - * The following code requires at least two items on the insert list, - * this test serves the additional purpose of confirming that. + * our heuristic for that. A 1/4 probability of adding a new skiplist + * level, with level-0 always created, means there will be a 5th level + * entry for roughly every 1024 entries in the list. If there are at + * least 4 5th level entries (4K items), the list is large enough. */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1) ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); - if (ins_head == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL || - ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == - ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - return (0); - - return (1); + if (ins_head == NULL) + return (false); + for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) + if (++i == 4) { + WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable); + WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable); + return (true); + } + return (false); } /* * __wt_page_can_evict -- * Check whether a page can be evicted. */ -static inline int +static inline bool __wt_page_can_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int check_splits, int *inmem_splitp) { @@ -980,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, btree = S2BT(session); mod = page->modify; - txn_global = &S2C(session)->txn_global; /* Pages that have never been modified can always be evicted. */ if (mod == NULL) - return (1); + return (true); + + /* + * Check for in-memory splits before other eviction tests. If the page + * should split in-memory, return success immediately and skip more + * detailed eviction tests. We don't need further tests since the page + * won't be written or discarded from the cache. + */ + if (__wt_page_can_split(session, page)) { + if (inmem_splitp != NULL) + *inmem_splitp = 1; + return (true); + } /* * If the tree was deepened, there's a requirement that newly created @@ -997,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, */ if (check_splits && WT_PAGE_IS_INTERNAL(page) && !__wt_txn_visible_all(session, mod->mod_split_txn)) - return (0); - - /* - * Allow for the splitting of pages when a checkpoint is underway only - * if the allow_splits flag has been passed, we know we are performing - * a checkpoint, the page is larger than the stated maximum and there - * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK - * flag is unset. 
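The arithmetic behind the revised WT_MIN_SPLIT_SKIPLIST_DEPTH heuristic, spelled out:

	P(entry reaches skiplist level 5) = (1/4)^5 = 1/1024
	4 entries found at level 5 => roughly 4 * 1024 = 4096 items,
	the "4K items" threshold the comment describes.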
- */ - if (__wt_page_can_split(session, page)) { - if (inmem_splitp != NULL) - *inmem_splitp = 1; - return (1); - } + return (false); /* * If the file is being checkpointed, we can't evict dirty pages: @@ -1018,48 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, * previous version might be referenced by an internal page already * written in the checkpoint, leaving the checkpoint inconsistent. */ - if (btree->checkpointing && - (__wt_page_is_modified(page) || - F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) { + if (btree->checkpointing && __wt_page_is_modified(page)) { WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint); WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint); - return (0); + return (false); } /* - * If we aren't (potentially) doing eviction that can restore updates - * and the updates on this page are too recent, give up. - * - * Don't rely on new updates being skipped by the transaction used - * for transaction reads: (1) there are paths that dirty pages for - * artificial reasons; (2) internal pages aren't transactional; and - * (3) if an update was skipped during the checkpoint (leaving the page - * dirty), then rolled back, we could still successfully overwrite a - * page and corrupt the checkpoint. + * If the page was recently split in-memory, don't evict it immediately: + * we want to give application threads that are appending a chance to + * move to the new leaf page created by the split. * - * Further, we can't race with the checkpoint's reconciliation of - * an internal page as we evict a clean child from the page's subtree. - * This works in the usual way: eviction locks the page and then checks - * for existing hazard pointers, the checkpoint thread reconciling an - * internal page acquires hazard pointers on child pages it reads, and - * is blocked by the exclusive lock. - */ - if (page->read_gen != WT_READGEN_OLDEST && - !__wt_txn_visible_all(session, __wt_page_is_modified(page) ? - mod->update_txn : mod->rec_max_txn)) - return (0); - - /* - * If the page was recently split in-memory, don't force it out: we - * hope an eviction thread will find it first. The check here is - * similar to __wt_txn_visible_all, but ignores the checkpoint's - * transaction. + * Note the check here is similar to __wt_txn_visible_all, but ignores + * the checkpoint's transaction. */ - if (check_splits && - WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) - return (0); + if (check_splits) { + txn_global = &S2C(session)->txn_global; + if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn)) + return (false); + } - return (1); + return (true); } /* @@ -1082,17 +1125,17 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) * reference without first locking the page, it could be evicted in * between. */ - locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED); + locked = __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED); if ((ret = __wt_hazard_clear(session, page)) != 0 || !locked) { if (locked) ref->state = WT_REF_MEM; return (ret == 0 ? EBUSY : ret); } - (void)WT_ATOMIC_ADD4(btree->evict_busy, 1); + (void)__wt_atomic_addv32(&btree->evict_busy, 1); too_big = (page->memory_footprint > btree->maxmempage) ? 
1 : 0; - if ((ret = __wt_evict_page(session, ref)) == 0) { + if ((ret = __wt_evict(session, ref, 0)) == 0) { if (too_big) WT_STAT_FAST_CONN_INCR(session, cache_eviction_force); else @@ -1106,7 +1149,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref) } else WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail); - (void)WT_ATOMIC_SUB4(btree->evict_busy, 1); + (void)__wt_atomic_subv32(&btree->evict_busy, 1); return (ret); } @@ -1143,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) * memory_page_max setting, when we see many deleted items, and when we * are attempting to scan without trashing the cache. * - * Fast checks if eviction is disabled for this operation or this tree, - * then perform a general check if eviction will be possible. + * Fast checks if eviction is disabled for this handle, operation or + * tree, then perform a general check if eviction will be possible. */ page = ref->page; if (page->read_gen != WT_READGEN_OLDEST || LF_ISSET(WT_READ_NO_EVICT) || + F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION) || !__wt_page_can_evict(session, page, 1, NULL)) return (__wt_hazard_clear(session, page)); @@ -1264,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session) } /* - * __wt_btree_lsm_size -- + * __wt_btree_lsm_over_size -- * Return if the size of an in-memory tree with a single leaf page is over * a specified maximum. If called on anything other than a simple tree with a * single leaf page, returns true so our LSM caller will switch to a new tree. */ -static inline int -__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) +static inline bool +__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) { WT_BTREE *btree; WT_PAGE *child, *root; @@ -1282,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) /* Check for a non-existent tree. */ if (root == NULL) - return (0); + return (false); /* A tree that can be evicted always requires a switch. */ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) - return (1); + return (true); /* Check for a tree with a single leaf page. */ WT_INTL_INDEX_GET(session, root, pindex); if (pindex->entries != 1) /* > 1 child page, switch */ - return (1); + return (true); first = pindex->index[0]; if (first->state != WT_REF_MEM) /* no child page, ignore */ - return (0); + return (false); /* * We're reaching down into the page without a hazard pointer, but @@ -1304,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize) */ child = first->page; if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */ - return (1); + return (true); return (child->memory_footprint > maxsize); } diff --git a/src/include/cache.h b/src/include/cache.h index ed93f82538c..f199372ea5e 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -18,11 +18,6 @@ #define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */ #define WT_EVICT_WALK_INCR 100 /* Pages added each walk */ -#define WT_EVICT_PASS_AGGRESSIVE 0x01 -#define WT_EVICT_PASS_ALL 0x02 -#define WT_EVICT_PASS_DIRTY 0x04 -#define WT_EVICT_PASS_WOULD_BLOCK 0x08 - /* * WT_EVICT_ENTRY -- * Encapsulation of an eviction candidate. @@ -96,7 +91,7 @@ struct __wt_cache { /* * LRU eviction list information. 
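A caller's-eye view of the rename above: __wt_btree_lsm_over_size() answers "should LSM switch to a new chunk?", so the bool return reads naturally at the call site. A hypothetical call, assuming the LSM tree's configured chunk size limit in bytes:

	/* Switch to a new in-memory chunk once the current one fills. */
	if (__wt_btree_lsm_over_size(session, lsm_tree->chunk_size))
		WT_RET(__wt_lsm_tree_switch(session, lsm_tree));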
*/ - WT_EVICT_ENTRY *evict; /* LRU pages being tracked */ + WT_EVICT_ENTRY *evict_queue; /* LRU pages being tracked */ WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */ uint32_t evict_candidates; /* LRU list pages to evict */ uint32_t evict_entries; /* LRU entries in the queue */ @@ -109,6 +104,7 @@ struct __wt_cache { * Cache pool information. */ uint64_t cp_pass_pressure; /* Calculated pressure from this pass */ + uint64_t cp_quota; /* Maximum size for this cache */ uint64_t cp_reserved; /* Base size for this cache */ WT_SESSION_IMPL *cp_session; /* May be used for cache management */ uint32_t cp_skip_count; /* Post change stabilization */ @@ -119,6 +115,15 @@ struct __wt_cache { uint64_t cp_saved_read; /* Read count at last review */ /* + * Work state. + */ +#define WT_EVICT_PASS_AGGRESSIVE 0x01 +#define WT_EVICT_PASS_ALL 0x02 +#define WT_EVICT_PASS_DIRTY 0x04 +#define WT_EVICT_PASS_WOULD_BLOCK 0x08 + uint32_t state; + + /* * Flags. */ #define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */ @@ -140,6 +145,7 @@ struct __wt_cache_pool { const char *name; uint64_t size; uint64_t chunk; + uint64_t quota; uint64_t currently_used; uint32_t refs; /* Reference count for structure. */ /* Locked: List of connections participating in the cache pool. */ diff --git a/src/include/cache.i b/src/include/cache.i index 87f8c5543d1..bc33f82d927 100644 --- a/src/include/cache.i +++ b/src/include/cache.i @@ -104,48 +104,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache) } /* - * __wt_cache_status -- - * Return if the cache usage exceeds the eviction or dirty targets. - */ -static inline void -__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp) -{ - WT_CONNECTION_IMPL *conn; - WT_CACHE *cache; - uint64_t bytes_inuse, bytes_max, dirty_inuse; - - conn = S2C(session); - cache = conn->cache; - - /* - * There's an assumption "evict" overrides "dirty", that is, if eviction - * is required, we no longer care where we are with respect to the dirty - * target. - * - * Avoid division by zero if the cache size has not yet been set in a - * shared cache. - */ - bytes_max = conn->cache_size + 1; - if (evictp != NULL) { - bytes_inuse = __wt_cache_bytes_inuse(cache); - if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) { - *evictp = 1; - return; - } - *evictp = 0; - } - if (dirtyp != NULL) { - dirty_inuse = __wt_cache_dirty_inuse(cache); - if (dirty_inuse > - (cache->eviction_dirty_target * bytes_max) / 100) { - *dirtyp = 1; - return; - } - *dirtyp = 0; - } -} - -/* * __wt_session_can_wait -- * Return if a session available for a potentially slow operation. */ @@ -161,29 +119,52 @@ __wt_session_can_wait(WT_SESSION_IMPL *session) return (0); /* - * LSM sets the no-cache-check flag when holding the LSM tree lock, + * LSM sets the no-eviction flag when holding the LSM tree lock, * in that case, or when holding the schema lock, we don't want to * highjack the thread for eviction. */ if (F_ISSET(session, - WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA)) + WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA)) return (0); return (1); } /* + * __wt_eviction_aggressive -- + * Return if the eviction server is running in aggressive mode. + */ +static inline int +__wt_eviction_aggressive(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 
1 : 0); +} + +/* + * __wt_eviction_dirty_target -- + * Return if the eviction server is running to reduce the number of dirty + * pages (versus running to discard pages from the cache). + */ +static inline int +__wt_eviction_dirty_target(WT_SESSION_IMPL *session) +{ + return (FLD_ISSET( + S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0); +} + +/* * __wt_eviction_needed -- * Return if an application thread should do eviction, and the cache full * percentage as a side-effect. */ -static inline int -__wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp) +static inline bool +__wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp) { WT_CONNECTION_IMPL *conn; WT_CACHE *cache; uint64_t bytes_inuse, bytes_max; - int pct_full; + u_int pct_full; conn = S2C(session); cache = conn->cache; @@ -196,25 +177,20 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp) bytes_max = conn->cache_size + 1; /* - * Return the cache full percentage; anything over 95% means we involve - * the application thread. + * Calculate the cache full percentage; anything over the trigger means + * we involve the application thread. */ - pct_full = (int)((100 * bytes_inuse) / bytes_max); + pct_full = (u_int)((100 * bytes_inuse) / bytes_max); if (pct_fullp != NULL) *pct_fullp = pct_full; - if (pct_full >= 95) - return (1); + if (pct_full > cache->eviction_trigger) + return (true); - /* - * Return if we're over the trigger cache size or there are too many - * dirty pages. - */ - if (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100) - return (1); + /* Return if there are too many dirty bytes in cache. */ if (__wt_cache_dirty_inuse(cache) > (cache->eviction_dirty_trigger * bytes_max) / 100) - return (1); - return (0); + return (true); + return (false); } /* @@ -225,7 +201,7 @@ static inline int __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp) { WT_BTREE *btree; - int pct_full; + u_int pct_full; if (didworkp != NULL) *didworkp = 0; @@ -235,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp) * that case, or when holding the schema or handle list locks (which * block eviction), we don't want to highjack the thread for eviction. */ - if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK | + if (F_ISSET(session, WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA)) return (0); diff --git a/src/include/cell.i b/src/include/cell.i index 20a4d214015..a517ac4a523 100644 --- a/src/include/cell.i +++ b/src/include/cell.i @@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size) p = cell->__chunk + 1; - if (recno == 0) + if (recno == WT_RECNO_OOB) cell->__chunk[0] = cell_type; /* Type */ else { cell->__chunk[0] = cell_type | WT_CELL_64V; @@ -547,7 +547,8 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell) * Unpack a WT_CELL into a structure during verification. */ static inline int -__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) +__wt_cell_unpack_safe( + WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *start, const void *end) { struct { uint32_t len; @@ -560,14 +561,15 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end) copy.v = 0; /* -Werror=maybe-uninitialized */ /* - * The verification code specifies an end argument, a pointer to 1 past - * the end-of-page. In that case, make sure we don't go past the end - * of the page when reading. 
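Why the cell-unpack check needs a start pointer as well as an end pointer: the prefix-compressed key path below rewinds the cell pointer by an on-page offset, and a corrupt offset can land before the page image, which the old end-only test missed. The check in table form:

	t < start		reject: rewound before the page image
	t + len > end		reject: read past the end of the page
	otherwise		in bounds, safe to read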
If an error occurs, we simply return the - * error code, the verification code takes care of complaining (and, in - * the case of salvage, it won't complain at all, it's OK to fail). + * The verification code specifies start/end arguments, pointers to the + * start of the page and to 1 past the end-of-page. In which case, make + * sure all reads are inside the page image. If an error occurs, return + * an error code but don't output messages, our caller handles that. */ -#define WT_CELL_LEN_CHK(p, len) do { \ - if (end != NULL && (((uint8_t *)p) + (len)) > end) \ +#define WT_CELL_LEN_CHK(t, len) do { \ + if (start != NULL && \ + ((uint8_t *)t < (uint8_t *)start || \ + (((uint8_t *)t) + (len)) > (uint8_t *)end)) \ return (WT_ERROR); \ } while (0) @@ -630,7 +632,7 @@ restart: */ if (cell->__chunk[0] & WT_CELL_64V) /* skip value */ WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v)); + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->v)); /* * Handle special actions for a few different cell types and set the @@ -647,7 +649,7 @@ restart: * earlier cell. */ WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : (size_t)(end - p), &v)); + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v)); copy.len = WT_PTRDIFF32(p, cell); copy.v = unpack->v; cell = (WT_CELL *)((uint8_t *)cell - v); @@ -675,7 +677,7 @@ restart: * data. */ WT_RET(__wt_vunpack_uint( - &p, end == NULL ? 0 : (size_t)(end - p), &v)); + &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v)); if (unpack->raw == WT_CELL_KEY || unpack->raw == WT_CELL_KEY_PFX || @@ -716,7 +718,7 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len); static inline void __wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack) { - (void)__wt_cell_unpack_safe(cell, unpack, NULL); + (void)__wt_cell_unpack_safe(cell, unpack, NULL, NULL); } /* diff --git a/src/include/connection.h b/src/include/connection.h index cd55aadfc07..d8ff261cd82 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -38,8 +38,8 @@ struct __wt_keyed_encryptor { size_t size_const; /* The result of the sizing callback */ WT_ENCRYPTOR *encryptor; /* User supplied callbacks */ /* Linked list of encryptors */ - SLIST_ENTRY(__wt_keyed_encryptor) hashl; - SLIST_ENTRY(__wt_keyed_encryptor) l; + TAILQ_ENTRY(__wt_keyed_encryptor) hashq; + TAILQ_ENTRY(__wt_keyed_encryptor) q; }; /* @@ -82,9 +82,9 @@ struct __wt_named_encryptor { const char *name; /* Name of encryptor */ WT_ENCRYPTOR *encryptor; /* User supplied callbacks */ /* Locked: list of encryptors by key */ - SLIST_HEAD(__wt_keyedhash, __wt_keyed_encryptor) - keyedhashlh[WT_HASH_ARRAY_SIZE]; - SLIST_HEAD(__wt_keyed_lh, __wt_keyed_encryptor) keyedlh; + TAILQ_HEAD(__wt_keyedhash, __wt_keyed_encryptor) + keyedhashqh[WT_HASH_ARRAY_SIZE]; + TAILQ_HEAD(__wt_keyed_qh, __wt_keyed_encryptor) keyedqh; /* Linked list of encryptors */ TAILQ_ENTRY(__wt_named_encryptor) q; }; @@ -100,10 +100,10 @@ struct __wt_named_extractor { }; /* - * Allocate some additional slots for internal sessions. There is a default - * session for each connection, plus a session for each server thread. + * Allocate some additional slots for internal sessions so the user cannot + * configure too few sessions for us to run. */ -#define WT_NUM_INTERNAL_SESSIONS 10 +#define WT_EXTRA_INTERNAL_SESSIONS 10 /* * WT_CONN_CHECK_PANIC -- @@ -119,14 +119,15 @@ struct __wt_named_extractor { * main queue and the hashed queue. 
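Background on the queue conversions running through these headers: with the <sys/queue.h> macros, SLIST_REMOVE must walk the list from the head to find the element's predecessor, while TAILQ_REMOVE unlinks in constant time through the stored back-link. The difference at a removal site:

	/* O(n): scan from the head to unlink the element. */
	SLIST_REMOVE(&conn->dhlh, dhandle, __wt_data_handle, l);

	/* O(1): unlink directly via the element's embedded back pointer. */
	TAILQ_REMOVE(&conn->dhqh, dhandle, q);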
*/ #define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \ - SLIST_INSERT_HEAD(&(conn)->dhlh, dhandle, l); \ - SLIST_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashl); \ + TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \ + TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \ + ++conn->dhandle_count; \ } while (0) #define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \ - SLIST_REMOVE(&(conn)->dhlh, dhandle, __wt_data_handle, l); \ - SLIST_REMOVE(&(conn)->dhhash[bucket], \ - dhandle, __wt_data_handle, hashl); \ + TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \ + TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \ + --conn->dhandle_count; \ } while (0) /* @@ -134,14 +135,13 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_BLOCK_INSERT(conn, block, bucket) do { \ - SLIST_INSERT_HEAD(&(conn)->blocklh, block, l); \ - SLIST_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashl); \ + TAILQ_INSERT_HEAD(&(conn)->blockqh, block, q); \ + TAILQ_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashq); \ } while (0) #define WT_CONN_BLOCK_REMOVE(conn, block, bucket) do { \ - SLIST_REMOVE(&(conn)->blocklh, block, __wt_block, l); \ - SLIST_REMOVE( \ - &(conn)->blockhash[bucket], block, __wt_block, hashl); \ + TAILQ_REMOVE(&(conn)->blockqh, block, q); \ + TAILQ_REMOVE(&(conn)->blockhash[bucket], block, hashq); \ } while (0) /* @@ -149,13 +149,13 @@ struct __wt_named_extractor { * main queue and the hashed queue. */ #define WT_CONN_FILE_INSERT(conn, fh, bucket) do { \ - SLIST_INSERT_HEAD(&(conn)->fhlh, fh, l); \ - SLIST_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashl); \ + TAILQ_INSERT_HEAD(&(conn)->fhqh, fh, q); \ + TAILQ_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashq); \ } while (0) #define WT_CONN_FILE_REMOVE(conn, fh, bucket) do { \ - SLIST_REMOVE(&(conn)->fhlh, fh, __wt_fh, l); \ - SLIST_REMOVE(&(conn)->fhhash[bucket], fh, __wt_fh, hashl); \ + TAILQ_REMOVE(&(conn)->fhqh, fh, q); \ + TAILQ_REMOVE(&(conn)->fhhash[bucket], fh, hashq); \ } while (0) /* @@ -180,13 +180,17 @@ struct __wt_connection_impl { WT_SPINLOCK table_lock; /* Table creation spinlock */ /* - * We distribute the btree page locks across a set of spin locks; it - * can't be an array, we impose cache-line alignment and gcc doesn't - * support that for arrays. Don't use too many: they are only held for - * very short operations, each one is 64 bytes, so 256 will fill the L1 - * cache on most CPUs. + * We distribute the btree page locks across a set of spin locks. Don't + * use too many: they are only held for very short operations, each one + * is 64 bytes, so 256 will fill the L1 cache on most CPUs. + * + * Use a prime number of buckets rather than assuming a good hash + * (Reference Sedgewick, Algorithms in C, "Hash Functions"). + * + * Note: this can't be an array, we impose cache-line alignment and gcc + * doesn't support that for arrays smaller than the alignment. */ -#define WT_PAGE_LOCKS(conn) 16 +#define WT_PAGE_LOCKS 17 WT_SPINLOCK *page_lock; /* Btree page spinlocks */ u_int page_lock_cnt; /* Next spinlock to use */ @@ -211,6 +215,8 @@ struct __wt_connection_impl { WT_FH *lock_fh; /* Lock file handle */ volatile uint64_t split_gen; /* Generation number for splits */ + uint64_t split_stashed_bytes; /* Atomic: split statistics */ + uint64_t split_stashed_objects; /* * The connection keeps a cache of data handles. The set of handles @@ -219,24 +225,26 @@ struct __wt_connection_impl { * URI. 
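On the WT_PAGE_LOCKS change above: 17 is prime, so a simple modulo spreads hashed pointers across the buckets even when inputs share low-order alignment patterns. A hypothetical bucket pick, for illustration only (the shift discards bits that are identical for aligned allocations; this is not the tree's actual hash):

	#define WT_PAGE_LOCK_BUCKET(conn, page)				\
		(&(conn)->page_lock[					\
		    ((uintptr_t)(page) >> 6) % WT_PAGE_LOCKS])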
*/ /* Locked: data handle hash array */ - SLIST_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE]; + TAILQ_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE]; /* Locked: data handle list */ - SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh; + TAILQ_HEAD(__wt_dhandle_qh, __wt_data_handle) dhqh; /* Locked: LSM handle list. */ TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh; /* Locked: file list */ - SLIST_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE]; - SLIST_HEAD(__wt_fh_lh, __wt_fh) fhlh; + TAILQ_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE]; + TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh; /* Locked: library list */ TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh; WT_SPINLOCK block_lock; /* Locked: block manager list */ - SLIST_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE]; - SLIST_HEAD(__wt_block_lh, __wt_block) blocklh; + TAILQ_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE]; + TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh; + u_int dhandle_count; /* Locked: handles in the queue */ u_int open_btree_count; /* Locked: open writable btree count */ uint32_t next_file_id; /* Locked: file ID counter */ uint32_t open_file_count; /* Atomic: open file handle count */ + uint32_t open_cursor_count; /* Atomic: open cursor handle count */ /* * WiredTiger allocates space for 50 simultaneous sessions (threads of @@ -262,7 +270,9 @@ struct __wt_connection_impl { uint32_t hazard_max; /* Hazard array size */ WT_CACHE *cache; /* Page cache */ - uint64_t cache_size; /* Configured cache size */ + volatile uint64_t cache_size; /* Cache size (either statically + configured or the current size + within a cache pool). */ WT_TXN_GLOBAL txn_global; /* Global transaction state */ @@ -277,9 +287,12 @@ struct __wt_connection_impl { #define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0) wt_off_t ckpt_logsize; /* Checkpoint log size period */ uint32_t ckpt_signalled;/* Checkpoint signalled */ - uint64_t ckpt_usecs; /* Checkpoint period */ - int compact_in_memory_pass; /* Compaction serialization */ + uint64_t ckpt_usecs; /* Checkpoint timer */ + uint64_t ckpt_time_max; /* Checkpoint time min/max */ + uint64_t ckpt_time_min; + uint64_t ckpt_time_recent; /* Checkpoint time recent/total */ + uint64_t ckpt_time_total; #define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */ #define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */ @@ -289,7 +302,9 @@ struct __wt_connection_impl { #define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */ uint32_t stat_flags; - WT_CONNECTION_STATS stats; /* Connection statistics */ + /* Connection statistics */ + WT_CONNECTION_STATS *stats[WT_COUNTER_SLOTS]; + WT_CONNECTION_STATS stat_array[WT_COUNTER_SLOTS]; WT_ASYNC *async; /* Async structure */ int async_cfg; /* Global async configuration */ @@ -325,7 +340,8 @@ struct __wt_connection_impl { #define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */ #define WT_CONN_LOG_EXISTED 0x04 /* Log files found */ #define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */ -#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */ +#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */ +#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ WT_SESSION_IMPL *log_session; /* Log server session */ @@ -354,6 +370,20 @@ struct __wt_connection_impl { time_t sweep_interval;/* Handle sweep interval */ u_int sweep_handles_min;/* Handle sweep 
minimum open */ + /* + * Shared lookaside lock, session and cursor, used by threads accessing + * the lookaside table (other than eviction server and worker threads + * and the sweep thread, all of which have their own lookaside cursors). + */ + WT_SPINLOCK las_lock; /* Lookaside table spinlock */ + WT_SESSION_IMPL *las_session; /* Lookaside table session */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + bool las_written; /* Lookaside table has been written */ + + WT_ITEM las_sweep_key; /* Sweep server's saved key */ + int las_sweep_call;/* Sweep server's call count */ + uint64_t las_sweep_cnt; /* Sweep server's per-call row count */ + /* Locked: collator list */ TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh; diff --git a/src/include/cursor.h b/src/include/cursor.h index 36f36f2c46c..2f55dfc8186 100644 --- a/src/include/cursor.h +++ b/src/include/cursor.h @@ -261,6 +261,7 @@ struct __wt_cursor_index { WT_CURSOR *child; WT_CURSOR **cg_cursors; + uint8_t *cg_needvalue; }; struct __wt_cursor_json { @@ -303,10 +304,10 @@ struct __wt_cursor_stat { int notinitialized; /* Cursor not initialized */ int notpositioned; /* Cursor not positioned */ - WT_STATS *stats; /* Stats owned by the cursor */ - WT_STATS *stats_first; /* First stats reference */ - int stats_base; /* Base statistics value */ - int stats_count; /* Count of stats elements */ + int64_t *stats; /* Statistics */ + int stats_base; /* Base statistics value */ + int stats_count; /* Count of statistics values */ + const char *(*stats_desc)(int); /* Statistics descriptions */ union { /* Copies of the statistics */ WT_DSRC_STATS dsrc_stats; @@ -325,12 +326,10 @@ struct __wt_cursor_stat { /* * WT_CURSOR_STATS -- - * Return a reference to a statistic cursor's stats structures; use the - * WT_CURSOR.stats_first field instead of WT_CURSOR.stats because the latter - * is NULL when non-cursor memory is used to hold the statistics. + * Return a reference to a statistic cursor's stats structures. */ #define WT_CURSOR_STATS(cursor) \ - (((WT_CURSOR_STAT *)cursor)->stats_first) + (((WT_CURSOR_STAT *)cursor)->stats) struct __wt_cursor_table { WT_CURSOR iface; diff --git a/src/include/cursor.i b/src/include/cursor.i index 9e592ede450..e7fed250251 100644 --- a/src/include/cursor.i +++ b/src/include/cursor.i @@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt) * and it's a minimal set of things we need to clear. It would be a * lot simpler to clear everything, but we call this function a lot. */ - cbt->recno = 0; + cbt->recno = WT_RECNO_OOB; cbt->ins = NULL; cbt->ins_head = NULL; @@ -150,7 +150,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session) dhandle = session->dhandle; /* If we open a handle with a time of death set, clear it. */ - if (WT_ATOMIC_ADD4(dhandle->session_inuse, 1) == 1 && + if (__wt_atomic_addi32(&dhandle->session_inuse, 1) == 1 && dhandle->timeofdeath != 0) dhandle->timeofdeath = 0; } @@ -168,7 +168,7 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) /* If we close a handle with a time of death set, clear it. */ WT_ASSERT(session, dhandle->session_inuse > 0); - if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0 && + if (__wt_atomic_subi32(&dhandle->session_inuse, 1) == 0 && dhandle->timeofdeath != 0) dhandle->timeofdeath = 0; } @@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter) if (reenter) WT_RET(__curfile_leave(cbt)); + /* + * Any old insert position is now invalid. We rely on this being + * cleared to detect if a new skiplist is installed after a search. 
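Usage of the shared lookaside fields above: threads without a private lookaside cursor go through the accessor pair declared later in this change, which hides the las_lock handling. A minimal sketch:

	static int
	las_scan_example(WT_SESSION_IMPL *session)
	{
		WT_CURSOR *cursor;
		uint32_t session_flags;

		WT_RET(__wt_las_cursor(session, &cursor, &session_flags));
		/* ... position the cursor, read or remove records ... */
		return (__wt_las_cursor_close(session, &cursor, session_flags));
	}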
+ */ + cbt->ins_stack[0] = NULL; + /* If the transaction is idle, check that the cache isn't full. */ WT_RET(__wt_txn_idle_cache_check(session)); diff --git a/src/include/dhandle.h b/src/include/dhandle.h index d41631696b4..9a54b4ddb66 100644 --- a/src/include/dhandle.h +++ b/src/include/dhandle.h @@ -28,14 +28,19 @@ */ #define WT_SAVE_DHANDLE(s, e) WT_WITH_DHANDLE(s, (s)->dhandle, e) +/* Check if a handle is inactive. */ +#define WT_DHANDLE_INACTIVE(dhandle) \ + (F_ISSET(dhandle, WT_DHANDLE_DEAD) || \ + !F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN)) + /* * WT_DATA_HANDLE -- * A handle for a generic named data source. */ struct __wt_data_handle { WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */ - SLIST_ENTRY(__wt_data_handle) l; - SLIST_ENTRY(__wt_data_handle) hashl; + TAILQ_ENTRY(__wt_data_handle) q; + TAILQ_ENTRY(__wt_data_handle) hashq; /* * Sessions caching a connection's data handle will have a non-zero @@ -64,7 +69,9 @@ struct __wt_data_handle { */ WT_SPINLOCK close_lock; /* Lock to close the handle */ - WT_DSRC_STATS stats; /* Data-source statistics */ + /* Data-source statistics */ + WT_DSRC_STATS *stats[WT_COUNTER_SLOTS]; + WT_DSRC_STATS stat_array[WT_COUNTER_SLOTS]; /* Flags values over 0xff are reserved for WT_BTREE_* */ #define WT_DHANDLE_DEAD 0x01 /* Dead, awaiting discard */ diff --git a/src/include/error.h b/src/include/error.h index fcb96b16361..abffc02945e 100644 --- a/src/include/error.h +++ b/src/include/error.h @@ -92,7 +92,8 @@ return (__wt_illegal_value(session, NULL)) #define WT_ILLEGAL_VALUE_ERR(session) \ default: \ - WT_ERR(__wt_illegal_value(session, NULL)) + ret = __wt_illegal_value(session, NULL); \ + goto err #define WT_ILLEGAL_VALUE_SET(session) \ default: \ ret = __wt_illegal_value(session, NULL); \ diff --git a/src/include/extern.h b/src/include/extern.h index f0c1a0e310a..a8f11c8694f 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -63,7 +63,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max); extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max); extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block); extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block); -extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); +extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp); extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid); extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]); @@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp); extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp); extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop); +extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt); -extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt); +extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel); extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v); extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t 
addr_size); extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile); @@ -115,12 +116,13 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char * extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp); extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); -extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages); extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages); +extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd); extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]); extern int __wt_btree_close(WT_SESSION_IMPL *session); extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno); @@ -138,15 +140,15 @@ extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *add extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store); extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack); extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell); +extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); +extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep); +extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size); extern int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ); -extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep); -extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep); -extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd); extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]); extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); @@ -159,10 +161,10 @@ extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op); extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]); -extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok); -extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf); +extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int 
empty_page_ok); +extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf); extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags); -extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove); +extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove); extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt); extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key); @@ -179,6 +181,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert); extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); +extern void __wt_las_stats_update(WT_SESSION_IMPL *session); +extern int __wt_las_create(WT_SESSION_IMPL *session); +extern int __wt_las_destroy(WT_SESSION_IMPL *session); +extern void __wt_las_set_written(WT_SESSION_IMPL *session); +extern bool __wt_las_is_written(WT_SESSION_IMPL *session); +extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); +extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags); +extern int __wt_las_sweep(WT_SESSION_IMPL *session); extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len); extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str); extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item); @@ -237,7 +247,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]); -extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield); +extern int __wt_log_wrlsn(WT_SESSION_IMPL *session); extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); extern int __wt_logmgr_open(WT_SESSION_IMPL *session); extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session); @@ -309,14 +319,14 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); -extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); -extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full); -extern void __wt_cache_dump(WT_SESSION_IMPL *session); +extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full); +extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int 
__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); +extern int __wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn); extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn); extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec); @@ -324,12 +334,14 @@ extern void __wt_log_written_reset(WT_SESSION_IMPL *session); extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only); extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count); extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id); +extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot); extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, int prealloc); extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum); extern int __wt_log_open(WT_SESSION_IMPL *session); extern int __wt_log_close(WT_SESSION_IMPL *session); -extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created); +extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep); extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie); +extern int __wt_log_force_write(WT_SESSION_IMPL *session, int retry); extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags); extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap); extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp); @@ -355,14 +367,15 @@ extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logr extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep); extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out); +extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern int __wt_log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced); +extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced); +extern int __wt_log_slot_new(WT_SESSION_IMPL *session); extern int __wt_log_slot_init(WT_SESSION_IMPL *session); extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session); -extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp); -extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); -extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size); -extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); +extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT 
*myslot); +extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size); +extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot); extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm); extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks); @@ -475,7 +488,7 @@ extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t siz extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size); extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie); extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp); -extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs); +extern int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled); extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name); @@ -489,7 +502,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp); extern int __wt_once(void (*init_routine)(void)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp); extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp); -extern int __wt_absolute_path(const char *path); +extern bool __wt_absolute_path(const char *path); extern const char *__wt_path_separator(void); extern int __wt_has_priv(void); extern int __wt_remove(WT_SESSION_IMPL *session, const char *name); @@ -577,6 +590,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers); extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); +extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); +extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip); @@ -639,7 +654,7 @@ extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, cons extern uint32_t __wt_nlpo2_round(uint32_t v); extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); -extern int __wt_ispo2(uint32_t v); +extern bool __wt_ispo2(uint32_t v); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state); extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state); @@ -655,11 +670,19 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp extern void __wt_scr_discard(WT_SESSION_IMPL *session); extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size); extern void 
__wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p); -extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats); -extern void __wt_stat_refresh_dsrc_stats(void *stats_arg); -extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent); -extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats); -extern void __wt_stat_refresh_connection_stats(void *stats_arg); +extern const char *__wt_stat_dsrc_desc(int slot); +extern void __wt_stat_dsrc_init_single(WT_DSRC_STATS *stats); +extern void __wt_stat_dsrc_init(WT_DATA_HANDLE *handle); +extern void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats); +extern void __wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats); +extern void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to); +extern void __wt_stat_dsrc_aggregate( WT_DSRC_STATS **from, WT_DSRC_STATS *to); +extern const char *__wt_stat_connection_desc(int slot); +extern void __wt_stat_connection_init_single(WT_CONNECTION_STATS *stats); +extern void __wt_stat_connection_init(WT_CONNECTION_IMPL *handle); +extern void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats); +extern void __wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats); +extern void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to); extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session); extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force); diff --git a/src/include/flags.h b/src/include/flags.h index 675ede9a8a0..ca3c3c38245 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -18,6 +18,8 @@ #define WT_CONN_SERVER_SWEEP 0x00002000 #define WT_CONN_WAS_BACKUP 0x00004000 #define WT_EVICTING 0x00000001 +#define WT_EVICT_LOOKASIDE 0x00000002 +#define WT_EVICT_UPDATE_RESTORE 0x00000004 #define WT_FILE_TYPE_CHECKPOINT 0x00000001 #define WT_FILE_TYPE_DATA 0x00000002 #define WT_FILE_TYPE_DIRECTORY 0x00000004 @@ -42,27 +44,25 @@ #define WT_READ_WONT_NEED 0x00000100 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 -#define WT_SESSION_DISCARD_FORCE 0x00000004 -#define WT_SESSION_INTERNAL 0x00000008 -#define WT_SESSION_LOCKED_CHECKPOINT 0x00000010 -#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000020 -#define WT_SESSION_LOCKED_SCHEMA 0x00000040 +#define WT_SESSION_INTERNAL 0x00000004 +#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 +#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010 +#define WT_SESSION_LOCKED_SCHEMA 0x00000020 +#define WT_SESSION_LOCKED_SLOT 0x00000040 #define WT_SESSION_LOCKED_TABLE 0x00000080 #define WT_SESSION_LOGGING_INMEM 0x00000100 -#define WT_SESSION_NO_CACHE 0x00000200 -#define WT_SESSION_NO_CACHE_CHECK 0x00000400 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200 +#define WT_SESSION_NO_CACHE 0x00000400 #define WT_SESSION_NO_DATA_HANDLES 0x00000800 -#define WT_SESSION_NO_LOGGING 0x00001000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000 -#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00004000 -#define WT_SESSION_SERVER_ASYNC 0x00008000 -#define WT_SKIP_UPDATE_ERR 0x00000002 -#define WT_SKIP_UPDATE_RESTORE 0x00000004 +#define WT_SESSION_NO_EVICTION 0x00001000 +#define WT_SESSION_NO_LOGGING 0x00002000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000 +#define WT_SESSION_SERVER_ASYNC 0x00010000 #define WT_SYNC_CHECKPOINT 0x00000001 #define WT_SYNC_CLOSE 0x00000002 #define WT_SYNC_DISCARD 0x00000004 -#define WT_SYNC_DISCARD_FORCE 
0x00000008 -#define WT_SYNC_WRITE_LEAVES 0x00000010 +#define WT_SYNC_WRITE_LEAVES 0x00000008 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 @@ -92,6 +92,7 @@ #define WT_VERB_VERIFY 0x00200000 #define WT_VERB_VERSION 0x00400000 #define WT_VERB_WRITE 0x00800000 +#define WT_VISIBILITY_ERR 0x00000008 /* * flags section: END * DO NOT EDIT: automatically built by dist/flags.py. */ diff --git a/src/include/gcc.h b/src/include/gcc.h index 1c61768d372..01e33792d73 100644 --- a/src/include/gcc.h +++ b/src/include/gcc.h @@ -85,56 +85,71 @@ * In summary, locking > barriers > volatile. * * To avoid locking shared data structures such as statistics and to permit - * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS - * (compare and swap) operations. + * atomic state changes, we rely on the atomic-add and atomic-cas (compare and + * swap) operations. */ -#define __WT_ATOMIC_ADD(v, val, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val)) -#define __WT_ATOMIC_FETCH_ADD(v, val, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val)) + #ifdef __clang__ /* - * We avoid __sync_bool_compare_and_swap with due to problems with - * optimization with some versions of clang. See - * http://llvm.org/bugs/show_bug.cgi?id=21499 for details. + * We avoid __sync_bool_compare_and_swap due to problems with optimization + * with some versions of clang. See http://llvm.org/bugs/show_bug.cgi?id=21499 + * for details. */ -#define __WT_ATOMIC_CAS(v, old, new, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - __sync_val_compare_and_swap(&(v), old, new) == (old)) +#define WT_ATOMIC_CAS(ptr, oldval, newval) \ + (__sync_val_compare_and_swap(ptr, oldval, newval) == oldval) #else -#define __WT_ATOMIC_CAS(v, old, new, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - __sync_bool_compare_and_swap(&(v), old, new)) +#define WT_ATOMIC_CAS(ptr, oldval, newval) \ + __sync_bool_compare_and_swap(ptr, oldval, newval) #endif -#define __WT_ATOMIC_STORE(v, val, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - __sync_lock_test_and_set(&(v), val)) -#define __WT_ATOMIC_SUB(v, val, n) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val)) - -#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1) -#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1) -#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1) -#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1) -#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1) - -#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2) -#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2) -#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2) -#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2) -#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2) - -#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4) -#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4) -#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4) -#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4) -#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4) - -#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8) -#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8) -#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8) -#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8) -#define WT_ATOMIC_SUB8(v, val) 
__WT_ATOMIC_SUB(v, val, 8) + +#define WT_ATOMIC_FUNC(name, ret, type) \ +static inline ret \ +__wt_atomic_add##name(type *vp, type v) \ +{ \ + return (__sync_add_and_fetch(vp, v)); \ +} \ +static inline ret \ +__wt_atomic_fetch_add##name(type *vp, type v) \ +{ \ + return (__sync_fetch_and_add(vp, v)); \ +} \ +static inline ret \ +__wt_atomic_store##name(type *vp, type v) \ +{ \ + return (__sync_lock_test_and_set(vp, v)); \ +} \ +static inline ret \ +__wt_atomic_sub##name(type *vp, type v) \ +{ \ + return (__sync_sub_and_fetch(vp, v)); \ +} \ +static inline bool \ +__wt_atomic_cas##name(type *vp, type old, type new) \ +{ \ + return (WT_ATOMIC_CAS(vp, old, new)); \ +} + +WT_ATOMIC_FUNC(8, uint8_t, uint8_t) +WT_ATOMIC_FUNC(16, uint16_t, uint16_t) +WT_ATOMIC_FUNC(32, uint32_t, uint32_t) +WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t) +WT_ATOMIC_FUNC(i32, int32_t, int32_t) +WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t) +WT_ATOMIC_FUNC(64, uint64_t, uint64_t) +WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t) +WT_ATOMIC_FUNC(i64, int64_t, int64_t) +WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t) +WT_ATOMIC_FUNC(size, size_t, size_t) + +/* + * __wt_atomic_cas_ptr -- + * Pointer compare and swap. + */ +static inline bool +__wt_atomic_cas_ptr(void *vp, void *old, void *new) +{ + return (WT_ATOMIC_CAS((void **)vp, old, new)); +} /* Compile read-write barrier */ #define WT_BARRIER() __asm__ volatile("" ::: "memory") diff --git a/src/include/hardware.h b/src/include/hardware.h index e3c098826d0..32353072c5b 100644 --- a/src/include/hardware.h +++ b/src/include/hardware.h @@ -33,8 +33,8 @@ uint8_t __orig; \ do { \ __orig = (p)->flags_atomic; \ - } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ - __orig, __orig | (uint8_t)(mask))); \ + } while (!__wt_atomic_cas8( \ + &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ } while (0) #define F_CAS_ATOMIC(p, mask, ret) do { \ @@ -46,16 +46,30 @@ ret = EBUSY; \ break; \ } \ - } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ - __orig, __orig | (uint8_t)(mask))); \ + } while (!__wt_atomic_cas8( \ + &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ +} while (0) + +#define F_CAS_ATOMIC_WAIT(p, mask) do { \ + int __ret; \ + for (;;) { \ + F_CAS_ATOMIC(p, mask, __ret); \ + if (__ret == 0) \ + break; \ + __wt_yield(); \ + } \ } while (0) #define F_CLR_ATOMIC(p, mask) do { \ uint8_t __orig; \ do { \ __orig = (p)->flags_atomic; \ - } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \ - __orig, __orig & ~(uint8_t)(mask))); \ + } while (!__wt_atomic_cas8( \ + &(p)->flags_atomic, __orig, __orig & ~(uint8_t)(mask))); \ } while (0) #define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */ +#define WT_CACHE_LINE_ALIGNMENT_VERIFY(session, a) \ + WT_ASSERT(session, \ + WT_PTRDIFF(&(a)[1], &(a)[0]) >= WT_CACHE_LINE_ALIGNMENT && \ + WT_PTRDIFF(&(a)[1], &(a)[0]) % WT_CACHE_LINE_ALIGNMENT == 0) diff --git a/src/include/lint.h b/src/include/lint.h index 964aa5c118f..f288fb98683 100644 --- a/src/include/lint.h +++ b/src/include/lint.h @@ -18,40 +18,71 @@ #define WT_GCC_FUNC_ATTRIBUTE(x) #define WT_GCC_FUNC_DECL_ATTRIBUTE(x) -#define __WT_ATOMIC_ADD(v, val) \ - ((v) += (val)) -#define __WT_ATOMIC_FETCH_ADD(v, val) \ - ((v) += (val), (v)) -#define __WT_ATOMIC_CAS(v, old, new) \ - ((v) = ((v) == (old) ? 
(new) : (old)), (v) == (old)) -#define __WT_ATOMIC_STORE(v, val) \ - ((v) = (val)) -#define __WT_ATOMIC_SUB(v, val) \ - ((v) -= (val), (v)) - -#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val) -#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val) -#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val) -#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val) - -#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val) -#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val) -#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val) -#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val) - -#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val) -#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val) -#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val) -#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val) - -#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val) -#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val) -#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new) -#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val) -#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val) +#define WT_ATOMIC_FUNC(name, ret, type) \ +static inline ret \ +__wt_atomic_add##name(type *vp, type v) \ +{ \ + *vp += v; \ + return (*vp); \ +} \ +static inline ret \ +__wt_atomic_fetch_add##name(type *vp, type v) \ +{ \ + type orig; \ + \ + orig = *vp; \ + *vp += v; \ + return (orig); \ +} \ +static inline ret \ +__wt_atomic_store##name(type *vp, type v) \ +{ \ + type orig; \ + \ + orig = *vp; \ + *vp = v; \ + return (orig); \ +} \ +static inline ret \ +__wt_atomic_sub##name(type *vp, type v) \ +{ \ + *vp -= v; \ + return (*vp); \ +} \ +static inline bool \ +__wt_atomic_cas##name(type *vp, type old, type new) \ +{ \ + if (*vp == old) { \ + *vp = new; \ + return (true); \ + } \ + return (false); \ +} + +WT_ATOMIC_FUNC(8, uint8_t, uint8_t) +WT_ATOMIC_FUNC(16, uint16_t, uint16_t) +WT_ATOMIC_FUNC(32, uint32_t, uint32_t) +WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t) +WT_ATOMIC_FUNC(i32, int32_t, int32_t) +WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t) +WT_ATOMIC_FUNC(64, uint64_t, uint64_t) +WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t) +WT_ATOMIC_FUNC(i64, int64_t, int64_t) +WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t) +WT_ATOMIC_FUNC(size, size_t, size_t) + +/* + * __wt_atomic_cas_ptr -- + * Pointer compare and swap. + */ +static inline bool +__wt_atomic_cas_ptr(void *vp, void *old, void *new) +{ + if (*(void **)vp == old) { + *(void **)vp = new; + return (true); + } + return (false); +} static inline void WT_BARRIER(void) { return; } static inline void WT_FULL_BARRIER(void) { return; } diff --git a/src/include/log.h b/src/include/log.h index fbb0a3e3842..06be95697c7 100644 --- a/src/include/log.h +++ b/src/include/log.h @@ -12,7 +12,6 @@ /* Logging subsystem declarations. */ #define WT_LOG_ALIGN 128 -#define WT_LOG_SLOT_BUF_SIZE 256 * 1024 #define WT_INIT_LSN(l) do { \ (l)->file = 1; \ @@ -48,67 +47,136 @@ ((size) - offsetof(WT_LOG_RECORD, record)) /* - * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1 - * and 1 if lsn0 > lsn1. - */ -#define WT_LOG_CMP(lsn1, lsn2) \ - ((lsn1)->file != (lsn2)->file ? \ - ((lsn1)->file < (lsn2)->file ? -1 : 1) : \ - ((lsn1)->offset != (lsn2)->offset ? \ - ((lsn1)->offset < (lsn2)->offset ? 
-1 : 1) : 0)) - -/* * Possible values for the consolidation array slot states: - * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.) * - * < WT_LOG_SLOT_DONE - threads are actively writing to the log. - * WT_LOG_SLOT_DONE - all activity on this slot is complete. + * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins. * WT_LOG_SLOT_FREE - slot is available for allocation. - * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active. * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker. - * WT_LOG_SLOT_READY - slot is ready for threads to join. - * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot. * * The slot state must be volatile: threads loop checking the state and can't * cache the first value they see. + * + * The slot state is divided into two 32 bit sizes. One half is the + * amount joined and the other is the amount released. Since we use + * a few special states, reserve the top few bits for state. That makes + * the maximum size less than 32 bits for both joined and released. + */ + +/* + * The high bit is reserved for the special states. If the high bit is + * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state. + */ +#define WT_LOG_SLOT_FREE -1 /* Not in use */ +#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */ + +/* + * We allocate the buffer size, but trigger a slot switch when we cross + * the maximum size of half the buffer. If a record is more than the buffer + * maximum then we trigger a slot switch and write that record unbuffered. + * We use a larger buffer to provide overflow space so that we can switch + * once we cross the threshold. + */ +#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */ +#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2) +#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1) + +/* + * If new slot states are added, adjust WT_LOG_SLOT_BITS and + * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32 + * bits we are using. More slot states here will reduce the maximum + * size that a slot can hold unbuffered by half. If a record is + * larger than the maximum we can account for in the slot state we fall + * back to direct writes. + */ +#define WT_LOG_SLOT_BITS 2 +#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS) +#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL /* Force slot close */ +#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */ + +/* + * Check if the unbuffered flag is set in the joined portion of + * the slot state. */ -#define WT_LOG_SLOT_DONE 0 -#define WT_LOG_SLOT_FREE 1 -#define WT_LOG_SLOT_PENDING 2 -#define WT_LOG_SLOT_WRITTEN 3 -#define WT_LOG_SLOT_READY 4 -typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { +#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \ + ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32)) + +#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL +#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF) +#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32) + +/* + * These macros manipulate the slot state and its component parts. 
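A worked example of the packing may help (the values are hypothetical; the component macros are defined just below): a state holding 512 bytes joined and 128 bytes released is

	int64_t state = ((int64_t)512 << 32) + 128;
	/* WT_LOG_SLOT_JOINED(state) == 512 */
	/* WT_LOG_SLOT_RELEASED(state) == 128 */
	/* WT_LOG_SLOT_INPROGRESS(state) is true: joined != released */
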
+ */ +#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON) +#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32) +#define WT_LOG_SLOT_JOINED_BUFFERED(state) \ + (WT_LOG_SLOT_JOINED(state) & \ + (WT_LOG_SLOT_UNBUFFERED - 1)) +#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s)) +#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state)) +#define WT_LOG_SLOT_RELEASED_BUFFERED(state) \ + ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \ + (WT_LOG_SLOT_UNBUFFERED - 1))) + +/* Slot is in use */ +#define WT_LOG_SLOT_ACTIVE(state) \ + (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK) +/* Slot is in use, but closed to new joins */ +#define WT_LOG_SLOT_CLOSED(state) \ + (WT_LOG_SLOT_ACTIVE(state) && \ + (FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_CLOSE) && \ + !FLD64_ISSET((uint64_t)state, WT_LOG_SLOT_RESERVED))) +/* Slot is in use, all data copied into buffer */ +#define WT_LOG_SLOT_INPROGRESS(state) \ + (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state)) +#define WT_LOG_SLOT_DONE(state) \ + (WT_LOG_SLOT_CLOSED(state) && \ + !WT_LOG_SLOT_INPROGRESS(state)) +/* Slot is in use, more threads may join this slot */ +#define WT_LOG_SLOT_OPEN(state) \ + (WT_LOG_SLOT_ACTIVE(state) && \ + !WT_LOG_SLOT_UNBUFFERED_ISSET(state) && \ + !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \ + WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX) + +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot { volatile int64_t slot_state; /* Slot state */ - uint64_t slot_group_size; /* Group size */ + int64_t slot_unbuffered; /* Unbuffered data in this slot */ int32_t slot_error; /* Error value */ -#define WT_SLOT_INVALID_INDEX 0xffffffff - uint32_t slot_index; /* Active slot index */ wt_off_t slot_start_offset; /* Starting file offset */ - WT_LSN slot_release_lsn; /* Slot release LSN */ - WT_LSN slot_start_lsn; /* Slot starting LSN */ - WT_LSN slot_end_lsn; /* Slot ending LSN */ + wt_off_t slot_last_offset; /* Last record offset */ + WT_LSN slot_release_lsn; /* Slot release LSN */ + WT_LSN slot_start_lsn; /* Slot starting LSN */ + WT_LSN slot_end_lsn; /* Slot ending LSN */ WT_FH *slot_fh; /* File handle for this group */ - WT_ITEM slot_buf; /* Buffer for grouped writes */ - int32_t slot_churn; /* Active slots are scarce. 
*/ + WT_ITEM slot_buf; /* Buffer for grouped writes */ -#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */ -#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */ -#define WT_SLOT_SYNC 0x04 /* Needs sync on release */ -#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */ +#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */ +#define WT_SLOT_SYNC 0x02 /* Needs sync on release */ +#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */ uint32_t flags; /* Flags */ -} WT_LOGSLOT; +}; -#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED) +#define WT_SLOT_INIT_FLAGS 0 -typedef struct { - WT_LOGSLOT *slot; - wt_off_t offset; -} WT_MYSLOT; +#define WT_WITH_SLOT_LOCK(session, log, op) do { \ + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \ + WT_WITH_LOCK(session, \ + &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \ +} while (0) + +struct __wt_myslot { + WT_LOGSLOT *slot; /* Slot I'm using */ + wt_off_t end_offset; /* My end offset in buffer */ + wt_off_t offset; /* Slot buffer offset */ +#define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */ +#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */ + uint32_t flags; /* Flags */ +}; - /* Offset of first record */ #define WT_LOG_FIRST_RECORD log->allocsize -typedef struct { +struct __wt_log { uint32_t allocsize; /* Allocation alignment size */ wt_off_t log_written; /* Amount of log written this period */ /* @@ -119,8 +187,9 @@ typedef struct { uint32_t tmp_fileid; /* Temporary file number */ uint32_t prep_missed; /* Pre-allocated file misses */ WT_FH *log_fh; /* Logging file handle */ - WT_FH *log_close_fh; /* Logging file handle to close */ WT_FH *log_dir_fh; /* Log directory file handle */ + WT_FH *log_close_fh; /* Logging file handle to close */ + WT_LSN log_close_lsn; /* LSN needed to close */ /* * System LSNs @@ -141,8 +210,9 @@ typedef struct { WT_SPINLOCK log_lock; /* Locked: Logging fields */ WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */ WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */ + WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */ - WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ + WT_RWLOCK *log_archive_lock; /* Archive and log cursors */ /* Notify any waiting threads when sync_lsn is updated. */ WT_CONDVAR *log_sync_cond; @@ -151,22 +221,25 @@ typedef struct { /* * Consolidation array information - * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL. * Our testing shows that the more consolidation we generate the * better the performance we see which equates to an active slot * slot count of one. + * + * Note: this can't be an array, we impose cache-line alignment and + * gcc doesn't support that for arrays. 
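As an illustration of the new lock scope, an operation such as installing a new active slot is expected to run under the WT_WITH_SLOT_LOCK macro defined above (an assumed call site; __wt_log_slot_new is declared in extern.h earlier in this change):

	WT_WITH_SLOT_LOCK(session, log,
	    ret = __wt_log_slot_new(session));
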
*/ -#define WT_SLOT_ACTIVE 1 #define WT_SLOT_POOL 128 - WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */ + WT_LOGSLOT *active_slot; /* Active slot */ WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */ size_t slot_buf_size; /* Buffer size for slots */ +#ifdef HAVE_DIAGNOSTIC + uint64_t write_calls; /* Calls to log_write */ +#endif -#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */ uint32_t flags; -} WT_LOG; +}; -typedef struct { +struct __wt_log_record { uint32_t len; /* 00-03: Record length including hdr */ uint32_t checksum; /* 04-07: Checksum of the record */ @@ -176,7 +249,7 @@ typedef struct { uint8_t unused[2]; /* 10-11: Padding */ uint32_t mem_len; /* 12-15: Uncompressed len if needed */ uint8_t record[0]; /* Beginning of actual data */ -} WT_LOG_RECORD; +}; /* * WT_LOG_DESC -- diff --git a/src/include/log.i b/src/include/log.i new file mode 100644 index 00000000000..ff309c31265 --- /dev/null +++ b/src/include/log.i @@ -0,0 +1,40 @@ +/*- + * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2); + +/* + * __wt_log_cmp -- + * Compare 2 LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2 + * and 1 if lsn1 > lsn2. + */ +static inline int +__wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2) +{ + WT_LSN l1, l2; + + /* + * Read LSNs into local variables so that we only read each field + * once and all comparisons are on the same values. + */ + l1 = *(volatile WT_LSN *)lsn1; + l2 = *(volatile WT_LSN *)lsn2; + + /* + * If the file numbers are different we don't need to compare the + * offset. + */ + if (l1.file != l2.file) + return (l1.file < l2.file ? -1 : 1); + /* + * If the file numbers are the same, compare the offset. + */ + if (l1.offset != l2.offset) + return (l1.offset < l2.offset ? -1 : 1); + return (0); +} diff --git a/src/include/lsm.h b/src/include/lsm.h index 08e57794fb8..11cf8204aec 100644 --- a/src/include/lsm.h +++ b/src/include/lsm.h @@ -74,7 +74,7 @@ struct __wt_cursor_lsm { * WT_LSM_CHUNK -- * A single chunk (file) in an LSM tree. 
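Returning to __wt_log_cmp above, a quick illustration of the ordering it implements (the values are hypothetical):

	WT_LSN a, b;
	a.file = 3; a.offset = 1024;
	b.file = 3; b.offset = 2048;
	/* __wt_log_cmp(&a, &b) == -1: same file, smaller offset first. */
	/* __wt_log_cmp(&b, &a) == 1; equal LSNs compare as 0. */
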
*/ -struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_lsm_chunk { +struct __wt_lsm_chunk { const char *uri; /* Data source for this chunk */ const char *bloom_uri; /* URI of Bloom filter, if any */ struct timespec create_ts; /* Creation time (for rate limiting) */ @@ -177,16 +177,14 @@ struct __wt_lsm_tree { const char *collator_name; int collator_owned; - int refcnt; /* Number of users of the tree */ - int8_t exclusive; /* Tree is locked exclusively */ + uint32_t refcnt; /* Number of users of the tree */ + uint8_t exclusive; /* Tree is locked exclusively */ #define LSM_TREE_MAX_QUEUE 100 - int queue_ref; + uint32_t queue_ref; WT_RWLOCK *rwlock; TAILQ_ENTRY(__wt_lsm_tree) q; - WT_DSRC_STATS stats; /* LSM-level statistics */ - uint64_t dsk_gen; uint64_t ckpt_throttle; /* Rate limiting due to checkpoints */ @@ -221,9 +219,28 @@ struct __wt_lsm_tree { WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */ size_t old_alloc; /* Space allocated for old chunks */ u_int nold_chunks; /* Number of old chunks */ - int freeing_old_chunks; /* Whether chunks are being freed */ + uint32_t freeing_old_chunks; /* Whether chunks are being freed */ uint32_t merge_aggressiveness; /* Increase amount of work per merge */ + /* + * We maintain a set of statistics outside of the normal statistics + * area, copying them into place when a statistics cursor is created. + */ +#define WT_LSM_TREE_STAT_INCR(session, fld) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + ++(fld); \ +} while (0) +#define WT_LSM_TREE_STAT_INCRV(session, fld, v) do { \ + if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \ + (fld) += (int64_t)(v); \ +} while (0) + int64_t bloom_false_positive; + int64_t bloom_hit; + int64_t bloom_miss; + int64_t lsm_checkpoint_throttle; + int64_t lsm_lookup_no_bloom; + int64_t lsm_merge_throttle; + #define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */ #define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */ #define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */ diff --git a/src/include/meta.h b/src/include/meta.h index 66547262417..a5a303f1630 100644 --- a/src/include/meta.h +++ b/src/include/meta.h @@ -21,7 +21,9 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ -#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */ +#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ + +#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ /* * Pre computed hash for the metadata file. Used to optimize comparisons diff --git a/src/include/misc.h b/src/include/misc.h index 7fb6ae13d38..1b2cbf11fc2 100644 --- a/src/include/misc.h +++ b/src/include/misc.h @@ -130,6 +130,7 @@ #define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask))) #define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask))) +#define FLD64_ISSET(field, mask) ((field) & ((uint64_t)(mask))) #define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask))) /* diff --git a/src/include/misc.i b/src/include/misc.i index 98facff02b9..6b502c4c1d1 100644 --- a/src/include/misc.i +++ b/src/include/misc.i @@ -7,6 +7,18 @@ */ /* + * __wt_cond_wait -- + * Wait on a mutex, optionally timing out. + */ +static inline int +__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +{ + int notused; + + return (__wt_cond_wait_signal(session, cond, usecs, ¬used)); +} + +/* * __wt_strdup -- * ANSI strdup function. 
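Tying the lookaside pieces together, a sketch of the cursor protocol suggested by WT_LAS_URI above and the __wt_las_cursor declarations in extern.h (the surrounding logic is assumed, not taken from this change):

	WT_CURSOR *cursor;
	uint32_t session_flags;

	WT_RET(__wt_las_cursor(session, &cursor, &session_flags));
	/* ... position the cursor; read or remove lookaside records ... */
	WT_RET(__wt_las_cursor_close(session, &cursor, session_flags));
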
*/ diff --git a/src/include/msvc.h b/src/include/msvc.h index bc72ddf8193..8f5aa9abde8 100644 --- a/src/include/msvc.h +++ b/src/include/msvc.h @@ -31,52 +31,56 @@ #define WT_GCC_FUNC_ATTRIBUTE(x) #define WT_GCC_FUNC_DECL_ATTRIBUTE(x) -#define __WT_ATOMIC_ADD(v, val, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val)) -#define __WT_ATOMIC_FETCH_ADD(v, val, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedExchangeAdd ## s((t*)&(v), (t)(val))) -#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedCompareExchange ## s \ - ((t*)&(v), (t)(new), (t)(old)) == (t)(old)) -#define __WT_ATOMIC_STORE(v, val, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedExchange ## s((t*)&(v), (t)(val))) -#define __WT_ATOMIC_SUB(v, val, n, s, t) \ - (WT_STATIC_ASSERT(sizeof(v) == (n)), \ - _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val)) +#define WT_ATOMIC_FUNC(name, ret, type, s, t) \ +static inline ret \ +__wt_atomic_add##name(type *vp, type v) \ +{ \ + return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v)) + (v)); \ +} \ +static inline ret \ +__wt_atomic_fetch_add##name(type *vp, type v) \ +{ \ + return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v))); \ +} \ +static inline ret \ +__wt_atomic_store##name(type *vp, type v) \ +{ \ + return (_InterlockedExchange ## s((t *)(vp), (t)(v))); \ +} \ +static inline ret \ +__wt_atomic_sub##name(type *vp, type v) \ +{ \ + return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \ +} \ +static inline bool \ +__wt_atomic_cas##name(type *vp, type old, type new) \ +{ \ + return (_InterlockedCompareExchange ## s \ + ((t *)(vp), (t)(new), (t)(old)) == (t)(old)); \ +} -#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char) -#define WT_ATOMIC_FETCH_ADD1(v, val) \ - __WT_ATOMIC_FETCH_ADD(v, val, 1, 8, char) -#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char) -#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char) -#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char) +WT_ATOMIC_FUNC(8, uint8_t, uint8_t, 8, char) +WT_ATOMIC_FUNC(16, uint16_t, uint16_t, 16, short) +WT_ATOMIC_FUNC(32, uint32_t, uint32_t, , long) +WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t, , long) +WT_ATOMIC_FUNC(i32, int32_t, int32_t, , long) +WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t, , long) +WT_ATOMIC_FUNC(64, uint64_t, uint64_t, 64, __int64) +WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t, 64, __int64) +WT_ATOMIC_FUNC(i64, int64_t, int64_t, 64, __int64) +WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t, 64, __int64) +WT_ATOMIC_FUNC(size, size_t, size_t, 64, __int64) -#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short) -#define WT_ATOMIC_FETCH_ADD2(v, val) \ - __WT_ATOMIC_FETCH_ADD(v, val, 2, 16, short) -#define WT_ATOMIC_CAS2(v, old, new) \ - __WT_ATOMIC_CAS(v, old, new, 2, 16, short) -#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short) -#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short) - -#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long) -#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4, , long) -#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long) -#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long) -#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long) - -#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64) -#define 
WT_ATOMIC_FETCH_ADD8(v, val) \ - __WT_ATOMIC_FETCH_ADD(v, val, 8, 64, __int64) -#define WT_ATOMIC_CAS8(v, old, new) \ - __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64) -#define WT_ATOMIC_STORE8(v, val) \ - __WT_ATOMIC_STORE(v, val, 8, 64, __int64) -#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64) +/* + * __wt_atomic_cas_ptr -- + * Pointer compare and swap. + */ +static inline bool +__wt_atomic_cas_ptr(void *vp, void *old, void *new) +{ + return (_InterlockedCompareExchange64( + vp, (int64_t)new, (int64_t)old) == ((int64_t)old)); +} static inline void WT_BARRIER(void) { _ReadWriteBarrier(); } static inline void WT_FULL_BARRIER(void) { _mm_mfence(); } diff --git a/src/include/mutex.h b/src/include/mutex.h index 7a5028d6a28..1f1bb8f4b5c 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -24,24 +24,20 @@ struct __wt_condvar { /* * !!! - * Don't touch this structure without understanding the read/write - * locking functions. + * Don't modify this structure without understanding the read/write locking + * functions. */ -typedef union { /* Read/write lock */ -#ifdef WORDS_BIGENDIAN - WiredTiger read/write locks require modification for big-endian systems. -#else +typedef union { /* Read/write lock */ uint64_t u; struct { - uint32_t us; + uint32_t wr; /* Writers and readers */ } i; struct { - uint16_t writers; - uint16_t readers; - uint16_t users; - uint16_t pad; + uint16_t writers; /* Now serving for writers */ + uint16_t readers; /* Now serving for readers */ + uint16_t users; /* Next available ticket number */ + uint16_t __notused; /* Padding */ } s; -#endif } wt_rwlock_t; /* @@ -69,20 +65,21 @@ struct __wt_rwlock { #if SPINLOCK_TYPE == SPINLOCK_GCC -typedef volatile int WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) - WT_SPINLOCK; +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { + volatile int lock; +}; #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\ SPINLOCK_TYPE == SPINLOCK_MSVC -typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct { +struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock { wt_mutex_t lock; const char *name; /* Statistics: mutex name */ int8_t initialized; /* Lock initialized, for cleanup */ -} WT_SPINLOCK; +}; #else diff --git a/src/include/mutex.i b/src/include/mutex.i index 8bca50635e6..5ea4583a2ab 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -31,7 +31,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name) WT_UNUSED(session); WT_UNUSED(name); - *(t) = 0; + t->lock = 0; return (0); } @@ -44,7 +44,7 @@ __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { WT_UNUSED(session); - *(t) = 0; + t->lock = 0; } /* @@ -56,7 +56,7 @@ __wt_spin_trylock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { WT_UNUSED(session); - return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY); + return (__sync_lock_test_and_set(&t->lock, 1) == 0 ? 
0 : EBUSY); } /* @@ -70,10 +70,10 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) WT_UNUSED(session); - while (__sync_lock_test_and_set(t, 1)) { - for (i = 0; *t && i < WT_SPIN_COUNT; i++) + while (__sync_lock_test_and_set(&t->lock, 1)) { + for (i = 0; t->lock && i < WT_SPIN_COUNT; i++) WT_PAUSE(); - if (*t) + if (t->lock) __wt_yield(); } } @@ -87,7 +87,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) { WT_UNUSED(session); - __sync_lock_release(t); + __sync_lock_release(&t->lock); } #elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\ diff --git a/src/include/os.h b/src/include/os.h index ba5d95657d5..518b124f547 100644 --- a/src/include/os.h +++ b/src/include/os.h @@ -56,7 +56,7 @@ typedef enum { case EMFILE: \ case ENFILE: \ case ENOSPC: \ - __wt_sleep(0L, 500000L); \ + __wt_sleep(0L, 50000L); \ continue; \ default: \ break; \ @@ -77,8 +77,8 @@ typedef enum { struct __wt_fh { char *name; /* File name */ uint64_t name_hash; /* Hash of name */ - SLIST_ENTRY(__wt_fh) l; /* List of open handles */ - SLIST_ENTRY(__wt_fh) hashl; /* Hashed list of handles */ + TAILQ_ENTRY(__wt_fh) q; /* List of open handles */ + TAILQ_ENTRY(__wt_fh) hashq; /* Hashed list of handles */ u_int ref; /* Reference count */ diff --git a/src/include/queue.h b/src/include/queue.h index 42e736e7b09..1d494875cf6 100644 --- a/src/include/queue.h +++ b/src/include/queue.h @@ -38,71 +38,17 @@ extern "C" { #endif /* - * This file defines four types of data structures: singly-linked lists, - * singly-linked tail queues, lists and tail queues. + * WiredTiger only uses the TAILQ macros (we've gotten into trouble in the past + * by trying to use simpler queues and subsequently discovering a list we didn't + * think would ever get to be large could, under some workloads, become large, + * and the linear performance for removal of elements from simpler macros proved + * to be more trouble than the memory savings were worth). * - * A singly-linked list is headed by a single forward pointer. The elements - * are singly linked for minimum space and pointer manipulation overhead at - * the expense of O(n) removal for arbitrary elements. New elements can be - * added to the list after an existing element or at the head of the list. - * Elements being removed from the head of the list should use the explicit - * macro for this purpose for optimum efficiency. A singly-linked list may - * only be traversed in the forward direction. Singly-linked lists are ideal - * for applications with large datasets and few or no removals or for - * implementing a LIFO queue. + * Additionally, we've altered the TAILQ_INSERT_XXX functions to include a write + * barrier, in order to ensure we never insert a partially built structure onto + * a list (this is required because the spinlocks we use don't necessarily imply + * a write barrier). * - * A singly-linked tail queue is headed by a pair of pointers, one to the - * head of the list and the other to the tail of the list. The elements are - * singly linked for minimum space and pointer manipulation overhead at the - * expense of O(n) removal for arbitrary elements. New elements can be added - * to the list after an existing element, at the head of the list, or at the - * end of the list. Elements being removed from the head of the tail queue - * should use the explicit macro for this purpose for optimum efficiency. - * A singly-linked tail queue may only be traversed in the forward direction. 
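Illustrating the write-barrier rationale above with the __wt_fh conversion (a sketch; the queue head name is hypothetical):

	/* Fully initialize the element first... */
	fh->name = name;
	fh->ref = 1;
	/* ...then link it: the barrier in TAILQ_INSERT_HEAD guarantees a
	 * lock-free forward scan never sees a partially built structure. */
	TAILQ_INSERT_HEAD(&conn->fhqh, fh, q);
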
- * Singly-linked tail queues are ideal for applications with large datasets - * and few or no removals or for implementing a FIFO queue. - * - * A list is headed by a single forward pointer (or an array of forward - * pointers for a hash table header). The elements are doubly linked - * so that an arbitrary element can be removed without a need to - * traverse the list. New elements can be added to the list before - * or after an existing element or at the head of the list. A list - * may only be traversed in the forward direction. - * - * A tail queue is headed by a pair of pointers, one to the head of the - * list and the other to the tail of the list. The elements are doubly - * linked so that an arbitrary element can be removed without a need to - * traverse the list. New elements can be added to the list before or - * after an existing element, at the head of the list, or at the end of - * the list. A tail queue may be traversed in either direction. - * - * For details on the use of these macros, see the queue(3) manual page. - * - * - * SLIST LIST STAILQ TAILQ - * _HEAD + + + + - * _HEAD_INITIALIZER + + + + - * _ENTRY + + + + - * _INIT + + + + - * _EMPTY + + + + - * _FIRST + + + + - * _NEXT + + + + - * _PREV - - - + - * _LAST - - + + - * _FOREACH + + + + - * _FOREACH_REVERSE - - - + - * _INSERT_HEAD + + + + - * _INSERT_BEFORE - + - + - * _INSERT_AFTER + + + + - * _INSERT_TAIL - - + + - * _CONCAT - - + + - * _REMOVE_HEAD + - + - - * _REMOVE + + + + - * - */ - -/* - * XXX * We #undef all of the macros because there are incompatible versions of this * file and these macros on various systems. What makes the problem worse is * they are included and/or defined by system include files which we may have @@ -111,50 +57,7 @@ extern "C" { * several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these * same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours. */ -#undef LIST_EMPTY -#undef LIST_ENTRY -#undef LIST_FIRST -#undef LIST_FOREACH -#undef LIST_HEAD -#undef LIST_HEAD_INITIALIZER -#undef LIST_INIT -#undef LIST_INSERT_AFTER -#undef LIST_INSERT_BEFORE -#undef LIST_INSERT_HEAD -#undef LIST_NEXT -#undef LIST_REMOVE -#undef QMD_TRACE_ELEM -#undef QMD_TRACE_HEAD -#undef QUEUE_MACRO_DEBUG -#undef SLIST_EMPTY -#undef SLIST_ENTRY -#undef SLIST_FIRST -#undef SLIST_FOREACH -#undef SLIST_FOREACH_PREVPTR -#undef SLIST_HEAD -#undef SLIST_HEAD_INITIALIZER -#undef SLIST_INIT -#undef SLIST_INSERT_AFTER -#undef SLIST_INSERT_HEAD -#undef SLIST_NEXT -#undef SLIST_REMOVE -#undef SLIST_REMOVE_HEAD -#undef STAILQ_CONCAT -#undef STAILQ_EMPTY -#undef STAILQ_ENTRY -#undef STAILQ_FIRST -#undef STAILQ_FOREACH -#undef STAILQ_HEAD -#undef STAILQ_HEAD_INITIALIZER -#undef STAILQ_INIT -#undef STAILQ_INSERT_AFTER -#undef STAILQ_INSERT_HEAD -#undef STAILQ_INSERT_TAIL -#undef STAILQ_LAST -#undef STAILQ_NEXT -#undef STAILQ_REMOVE -#undef STAILQ_REMOVE_HEAD -#undef STAILQ_REMOVE_HEAD_UNTIL + #undef TAILQ_CONCAT #undef TAILQ_EMPTY #undef TAILQ_ENTRY @@ -210,230 +113,6 @@ struct qm_trace { #endif /* QUEUE_MACRO_DEBUG */ /* - * Singly-linked List declarations. - */ -#define SLIST_HEAD(name, type) \ -struct name { \ - struct type *slh_first; /* first element */ \ -} - -#define SLIST_HEAD_INITIALIZER(head) \ - { NULL } - -#define SLIST_ENTRY(type) \ -struct { \ - struct type *sle_next; /* next element */ \ -} - -/* - * Singly-linked List functions. 
- */ -#define SLIST_EMPTY(head) ((head)->slh_first == NULL) - -#define SLIST_FIRST(head) ((head)->slh_first) - -#define SLIST_FOREACH(var, head, field) \ - for ((var) = SLIST_FIRST((head)); \ - (var); \ - (var) = SLIST_NEXT((var), field)) - -#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ - for ((varp) = &SLIST_FIRST((head)); \ - ((var) = *(varp)) != NULL; \ - (varp) = &SLIST_NEXT((var), field)) - -#define SLIST_INIT(head) do { \ - SLIST_FIRST((head)) = NULL; \ -} while (0) - -#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ - SLIST_NEXT((slistelm), field) = (elm); \ -} while (0) - -#define SLIST_INSERT_HEAD(head, elm, field) do { \ - SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ - SLIST_FIRST((head)) = (elm); \ -} while (0) - -#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) - -#define SLIST_REMOVE(head, elm, type, field) do { \ - if (SLIST_FIRST((head)) == (elm)) { \ - SLIST_REMOVE_HEAD((head), field); \ - } \ - else { \ - struct type *curelm = SLIST_FIRST((head)); \ - while (SLIST_NEXT(curelm, field) != (elm)) \ - curelm = SLIST_NEXT(curelm, field); \ - SLIST_NEXT(curelm, field) = \ - SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ - } \ -} while (0) - -#define SLIST_REMOVE_HEAD(head, field) do { \ - SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ -} while (0) - -/* - * Singly-linked Tail queue declarations. - */ -#define STAILQ_HEAD(name, type) \ -struct name { \ - struct type *stqh_first;/* first element */ \ - struct type **stqh_last;/* addr of last next element */ \ -} - -#define STAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).stqh_first } - -#define STAILQ_ENTRY(type) \ -struct { \ - struct type *stqe_next; /* next element */ \ -} - -/* - * Singly-linked Tail queue functions. - */ -#define STAILQ_CONCAT(head1, head2) do { \ - if (!STAILQ_EMPTY((head2))) { \ - *(head1)->stqh_last = (head2)->stqh_first; \ - (head1)->stqh_last = (head2)->stqh_last; \ - STAILQ_INIT((head2)); \ - } \ -} while (0) - -#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) - -#define STAILQ_FIRST(head) ((head)->stqh_first) - -#define STAILQ_FOREACH(var, head, field) \ - for ((var) = STAILQ_FIRST((head)); \ - (var); \ - (var) = STAILQ_NEXT((var), field)) - -#define STAILQ_INIT(head) do { \ - STAILQ_FIRST((head)) = NULL; \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ - STAILQ_NEXT((tqelm), field) = (elm); \ -} while (0) - -#define STAILQ_INSERT_HEAD(head, elm, field) do { \ - if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ - STAILQ_FIRST((head)) = (elm); \ -} while (0) - -#define STAILQ_INSERT_TAIL(head, elm, field) do { \ - STAILQ_NEXT((elm), field) = NULL; \ - *(head)->stqh_last = (elm); \ - (head)->stqh_last = &STAILQ_NEXT((elm), field); \ -} while (0) - -#define STAILQ_LAST(head, type, field) \ - (STAILQ_EMPTY((head)) ? 
\ - NULL : \ - ((struct type *) \ - ((char *)((head)->stqh_last) - __offsetof(struct type, field)))) - -#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) - -#define STAILQ_REMOVE(head, elm, type, field) do { \ - if (STAILQ_FIRST((head)) == (elm)) { \ - STAILQ_REMOVE_HEAD((head), field); \ - } \ - else { \ - struct type *curelm = STAILQ_FIRST((head)); \ - while (STAILQ_NEXT(curelm, field) != (elm)) \ - curelm = STAILQ_NEXT(curelm, field); \ - if ((STAILQ_NEXT(curelm, field) = \ - STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ - } \ -} while (0) - -#define STAILQ_REMOVE_HEAD(head, field) do { \ - if ((STAILQ_FIRST((head)) = \ - STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ - if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ -} while (0) - -/* - * List declarations. - */ -#define LIST_HEAD(name, type) \ -struct name { \ - struct type *lh_first; /* first element */ \ -} - -#define LIST_HEAD_INITIALIZER(head) \ - { NULL } - -#define LIST_ENTRY(type) \ -struct { \ - struct type *le_next; /* next element */ \ - struct type **le_prev; /* address of previous next element */ \ -} - -/* - * List functions. - */ - -#define LIST_EMPTY(head) ((head)->lh_first == NULL) - -#define LIST_FIRST(head) ((head)->lh_first) - -#define LIST_FOREACH(var, head, field) \ - for ((var) = LIST_FIRST((head)); \ - (var); \ - (var) = LIST_NEXT((var), field)) - -#define LIST_INIT(head) do { \ - LIST_FIRST((head)) = NULL; \ -} while (0) - -#define LIST_INSERT_AFTER(listelm, elm, field) do { \ - if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ - LIST_NEXT((listelm), field)->field.le_prev = \ - &LIST_NEXT((elm), field); \ - LIST_NEXT((listelm), field) = (elm); \ - (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ -} while (0) - -#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ - (elm)->field.le_prev = (listelm)->field.le_prev; \ - LIST_NEXT((elm), field) = (listelm); \ - *(listelm)->field.le_prev = (elm); \ - (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ -} while (0) - -#define LIST_INSERT_HEAD(head, elm, field) do { \ - if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ - LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ - LIST_FIRST((head)) = (elm); \ - (elm)->field.le_prev = &LIST_FIRST((head)); \ -} while (0) - -#define LIST_NEXT(elm, field) ((elm)->field.le_next) - -#define LIST_REMOVE(elm, field) do { \ - if (LIST_NEXT((elm), field) != NULL) \ - LIST_NEXT((elm), field)->field.le_prev = \ - (elm)->field.le_prev; \ - *(elm)->field.le_prev = LIST_NEXT((elm), field); \ -} while (0) - -/* * Tail queue declarations. 
*/ #define TAILQ_HEAD(name, type) \ @@ -488,6 +167,7 @@ struct { \ } while (0) #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + WT_WRITE_BARRIER(); \ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ TAILQ_NEXT((elm), field)->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ @@ -502,6 +182,7 @@ struct { \ } while (0) #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + WT_WRITE_BARRIER(); \ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ TAILQ_NEXT((elm), field) = (listelm); \ *(listelm)->field.tqe_prev = (elm); \ @@ -511,6 +192,7 @@ struct { \ } while (0) #define TAILQ_INSERT_HEAD(head, elm, field) do { \ + WT_WRITE_BARRIER(); \ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ TAILQ_FIRST((head))->field.tqe_prev = \ &TAILQ_NEXT((elm), field); \ @@ -523,6 +205,7 @@ struct { \ } while (0) #define TAILQ_INSERT_TAIL(head, elm, field) do { \ + WT_WRITE_BARRIER(); \ TAILQ_NEXT((elm), field) = NULL; \ (elm)->field.tqe_prev = (head)->tqh_last; \ *(head)->tqh_last = (elm); \ diff --git a/src/include/schema.h b/src/include/schema.h index 8f4884281cd..0664af5adba 100644 --- a/src/include/schema.h +++ b/src/include/schema.h @@ -62,8 +62,8 @@ struct __wt_table { WT_INDEX **indices; size_t idx_alloc; - SLIST_ENTRY(__wt_table) l; - SLIST_ENTRY(__wt_table) hashl; + TAILQ_ENTRY(__wt_table) q; + TAILQ_ENTRY(__wt_table) hashq; int cg_complete, idx_complete, is_simple; u_int ncolgroups, nindices, nkey_columns; diff --git a/src/include/serial.i b/src/include/serial.i index 9e6b0f7916c..d90b29c2133 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page) } /* - * __insert_serial_func -- - * Worker function to add a WT_INSERT entry to a skiplist. + * __insert_simple_func -- + * Worker function to add a WT_INSERT entry to the middle of a skiplist. */ static inline int -__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, +__insert_simple_func(WT_SESSION_IMPL *session, WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) { u_int i; @@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, WT_UNUSED(session); /* - * Confirm we are still in the expected position, and no item has been - * added where our insert belongs. Take extra care at the beginning - * and end of the list (at each level): retry if we race there. + * Update the skiplist elements referencing the new WT_INSERT item. + * If we fail connecting one of the upper levels in the skiplist, + * return success: the levels we updated are correct and sufficient. + * Even though we don't get the benefit of the memory we allocated, + * we can't roll back. * - * !!! - * Note the test for ins_stack[0] == NULL: that's the test for an - * uninitialized cursor, ins_stack[0] is cleared as part of - * initializing a cursor for a search. + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. 
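+ *
+ * As a rough sketch, the publication ordering this relies on looks like
+ * the following (illustrative only, not the literal code):
+ *
+ *	new_ins->upd = upd;			-- finish filling in new_ins
+ *	WT_WRITE_BARRIER();			-- flush the setup writes
+ *	(void)__wt_atomic_cas_ptr(		-- publish to concurrent readers
+ *	    ins_stack[0], old_ins, new_ins);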
*/ for (i = 0; i < skipdepth; i++) { - if (ins_stack[i] == NULL || - *ins_stack[i] != new_ins->next[i]) - return (WT_RESTART); - if (new_ins->next[i] == NULL && - ins_head->tail[i] != NULL && - ins_stack[i] != &ins_head->tail[i]->next[i]) - return (WT_RESTART); + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); } - /* Update the skiplist elements referencing the new WT_INSERT item. */ + return (0); +} + +/* + * __insert_serial_func -- + * Worker function to add a WT_INSERT entry to a skiplist. + */ +static inline int +__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, + WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth) +{ + u_int i; + + /* The cursor should be positioned. */ + WT_ASSERT(session, ins_stack[0] != NULL); + + /* + * Update the skiplist elements referencing the new WT_INSERT item. + * + * Confirm we are still in the expected position, and no item has been + * added where our insert belongs. If we fail connecting one of the + * upper levels in the skiplist, return success: the levels we updated + * are correct and sufficient. Even though we don't get the benefit of + * the memory we allocated, we can't roll back. + * + * All structure setup must be flushed before the structure is entered + * into the list. We need a write barrier here, our callers depend on + * it. Don't pass complex arguments to the macro, some implementations + * read the old value multiple times. + */ for (i = 0; i < skipdepth; i++) { + WT_INSERT *old_ins = *ins_stack[i]; + if (old_ins != new_ins->next[i] || + !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins)) + return (i == 0 ? WT_RESTART : 0); if (ins_head->tail[i] == NULL || ins_stack[i] == &ins_head->tail[i]->next[i]) ins_head->tail[i] = new_ins; - *ins_stack[i] = new_ins; } return (0); @@ -92,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head, * If the application didn't specify a record number, allocate a new one * and set up for an append. */ - if ((recno = WT_INSERT_RECNO(new_ins)) == 0) { + if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) { recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1; WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL || recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head))); @@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page, WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - /* Clear references to memory we now own. */ - *new_insp = NULL; - /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); + /* Clear references to memory we now own and must free on error. */ + *new_insp = NULL; + /* Acquire the page's spinlock, call the worker function. */ WT_PAGE_LOCK(session, page); ret = __col_append_serial_func( session, ins_head, ins_stack, new_ins, recnop, skipdepth); WT_PAGE_UNLOCK(session, page); - /* Free unused memory on error. */ if (ret != 0) { + /* Free unused memory on error. */ __wt_free(session, new_ins); return (ret); } @@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page, { WT_INSERT *new_ins = *new_insp; WT_DECL_RET; - - /* Clear references to memory we now own. */ - *new_insp = NULL; + int simple; + u_int i; /* Check for page write generation wrap. */ WT_RET(__page_write_gen_wrapped_check(page)); - /* Acquire the page's spinlock, call the worker function. 
 */
-	WT_PAGE_LOCK(session, page);
-	ret = __insert_serial_func(
-	    session, ins_head, ins_stack, new_ins, skipdepth);
-	WT_PAGE_UNLOCK(session, page);
+	/* Clear references to memory we now own and must free on error. */
+	*new_insp = NULL;
+
+	simple = 1;
+	for (i = 0; i < skipdepth; i++)
+		if (new_ins->next[i] == NULL)
+			simple = 0;
+
+	if (simple)
+		ret = __insert_simple_func(
+		    session, ins_stack, new_ins, skipdepth);
+	else {
+		WT_PAGE_LOCK(session, page);
+		ret = __insert_serial_func(
+		    session, ins_head, ins_stack, new_ins, skipdepth);
+		WT_PAGE_UNLOCK(session, page);
+	}

-	/* Free unused memory on error. */
 	if (ret != 0) {
+		/* Free unused memory on error. */
 		__wt_free(session, new_ins);
 		return (ret);
 	}
@@ -215,26 +257,27 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 	WT_DECL_RET;
 	WT_UPDATE *obsolete, *upd = *updp;

-	/* Clear references to memory we now own. */
-	*updp = NULL;
-
 	/* Check for page write generation wrap. */
 	WT_RET(__page_write_gen_wrapped_check(page));

+	/* Clear references to memory we now own and must free on error. */
+	*updp = NULL;
+
 	/*
+	 * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here, our callers depend on
+	 * it.
+	 *
 	 * Swap the update into place. If that fails, a new update was added
-	 * after our search, we raced. Check if our update is still permitted,
-	 * and if it is, do a full-barrier to ensure the update's next pointer
-	 * is set before we update the linked list and try again.
+	 * after our search and we raced. Check if our update is still
+	 * permitted.
 	 */
-	while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) {
+	while (!__wt_atomic_cas_ptr(srch_upd, upd->next, upd)) {
 		if ((ret = __wt_txn_update_check(
 		    session, upd->next = *srch_upd)) != 0) {
 			/* Free unused memory on error. */
 			__wt_free(session, upd);
 			return (ret);
 		}
-		WT_WRITE_BARRIER();
 	}

 	/*
@@ -249,25 +292,37 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 	__wt_page_modify_set(session, page);

 	/*
-	 * If there are subsequent WT_UPDATE structures, we're evicting pages
-	 * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
-	 * structures. Serialization is needed so only one thread does the
-	 * obsolete check at a time, and to protect updates from disappearing
-	 * under reconciliation.
+	 * If there are no subsequent WT_UPDATE structures, we are done here.
 	 */
-	if (upd->next != NULL &&
-	    __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
-		F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
-		/* If we can't lock it, don't scan, that's okay. */
-		if (ret != 0)
-			return (0);
-		obsolete = __wt_update_obsolete_check(session, page, upd->next);
-		F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
-		if (obsolete != NULL) {
+	if (upd->next == NULL)
+		return (0);
+	/*
+	 * We only want to call __wt_txn_update_oldest when there are further
+	 * updates to this page; the check against WT_TXN_NONE indicates
+	 * whether such updates exist.
+	 */
+	if (page->modify->obsolete_check_txn != WT_TXN_NONE) {
+		if (!__wt_txn_visible_all(session,
+		    page->modify->obsolete_check_txn)) {
+			/* Try to move the oldest ID forward and re-check. */
+			__wt_txn_update_oldest(session, 0);
+		}
+		if (!__wt_txn_visible_all(session,
+		    page->modify->obsolete_check_txn)) {
 			page->modify->obsolete_check_txn = WT_TXN_NONE;
-			__wt_update_obsolete_free(session, page, obsolete);
+			return (0);
 		}
 	}
+	F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
+
+	/* If we can't lock it, don't scan, that's okay.
*/ + if (ret != 0) + return (0); + obsolete = __wt_update_obsolete_check(session, page, upd->next); + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + if (obsolete != NULL) { + __wt_update_obsolete_free(session, page, obsolete); + } return (0); } diff --git a/src/include/session.h b/src/include/session.h index f32da177bf9..a691794fd46 100644 --- a/src/include/session.h +++ b/src/include/session.h @@ -14,8 +14,8 @@ struct __wt_data_handle_cache { WT_DATA_HANDLE *dhandle; - SLIST_ENTRY(__wt_data_handle_cache) l; - SLIST_ENTRY(__wt_data_handle_cache) hashl; + TAILQ_ENTRY(__wt_data_handle_cache) q; + TAILQ_ENTRY(__wt_data_handle_cache) hashq; }; /* @@ -66,7 +66,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { * across session close - so it is declared further down. */ /* Session handle reference list */ - SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles; + TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles; time_t last_sweep; /* Last sweep for dead handles */ WT_CURSOR *cursor; /* Current cursor */ @@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */ WT_COMPACT *compact; /* Compact state */ + /* + * Lookaside table cursor, sweep and eviction worker threads only. + */ + WT_CURSOR *las_cursor; /* Lookaside table cursor */ + WT_DATA_HANDLE *meta_dhandle; /* Metadata file */ void *meta_track; /* Metadata operation tracking */ void *meta_track_next; /* Current position */ @@ -90,7 +95,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { * table of lists. The hash table list is kept in allocated memory * that lives across session close - so it is declared further down. */ - SLIST_HEAD(__tables, __wt_table) tables; + TAILQ_HEAD(__tables, __wt_table) tables; WT_ITEM **scratch; /* Temporary memory for any function */ u_int scratch_alloc; /* Currently allocated */ @@ -151,9 +156,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { WT_RAND_STATE rnd; /* Random number generation state */ /* Hashed handle reference list array */ - SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash; + TAILQ_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash; /* Hashed table reference list array */ - SLIST_HEAD(__tables_hash, __wt_table) *tablehash; + TAILQ_HEAD(__tables_hash, __wt_table) *tablehash; /* * Splits can "free" memory that may still be in use, and we use a diff --git a/src/include/stat.h b/src/include/stat.h index 6dc9282a613..cd2c149bc94 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -6,122 +6,217 @@ * See the file LICENSE for redistribution information. */ -struct __wt_stats { - const char *desc; /* text description */ - uint64_t v; /* 64-bit value */ -}; +/* + * Statistics counters: + * + * We use an array of statistics structures; threads write different structures + * to avoid writing the same cache line and incurring cache coherency overheads, + * which can dramatically slow fast and otherwise read-mostly workloads. + * + * With an 8B statistics value and 64B cache-line alignment, 8 values share the + * same cache line. There are collisions when different threads choose the same + * statistics structure and update values that live on the cache line. There is + * likely some locality however: a thread updating the cursor search statistic + * is likely to update other cursor statistics with a chance of hitting already + * cached values. 
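+ *
+ * To make the arithmetic concrete (an illustrative aside): a 64B cache line
+ * holds 64 / 8 = 8 adjacent 8B counters, so two threads that pick the same
+ * structure contend whenever they update any two fields in the same group of
+ * eight.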
+ *
+ * The actual statistic value must be signed, because one thread might increment
+ * the value in its structure, and then another thread might decrement the same
+ * value in another structure (where the value was initially zero), so the value
+ * in the second thread's slot will go negative.
+ *
+ * When reading a statistics value, the array values are summed and returned to
+ * the caller. The summation is performed without locking, so the value read
+ * may be inconsistent (and might be negative, if increments/decrements race
+ * with the reader).
+ *
+ * Choosing how many structures isn't easy: obviously, a smaller number creates
+ * more conflicts while a larger number uses more memory.
+ *
+ * Ideally, if the application running on the system is CPU-intensive and using
+ * all CPUs on the system, we want to use the same number of slots as there are
+ * CPUs (because their L1 caches are the units of coherency). However, in
+ * practice we cannot easily determine how many CPUs are actually available to
+ * the application.
+ *
+ * Our next best option is to use the number of threads in the application as a
+ * heuristic for the number of CPUs (presumably, the application architect has
+ * figured out how many CPUs are available). However, inside WiredTiger we don't
+ * know when the application creates its threads.
+ *
+ * For now, we use a fixed number of slots. Ideally, we would approximate the
+ * largest number of cores we expect on any machine where WiredTiger might be
+ * run; however, we don't want to waste that much memory on smaller machines.
+ * As of 2015, machines with more than 24 CPUs are relatively rare.
+ *
+ * Use a prime number of slots, since a slot is chosen by reducing the session
+ * ID modulo the slot count, rather than assuming a good hash (see Sedgewick,
+ * Algorithms in C, "Hash Functions").
+ */
+#define WT_COUNTER_SLOTS 23

 /*
- * Read/write statistics without any test for statistics configuration.
+ * WT_STATS_SLOT_ID is the thread's slot ID for the array of structures.
+ *
+ * Ideally, we want a slot per CPU, and we want each thread to index the slot
+ * corresponding to the CPU it runs on. Unfortunately, getting the ID of the
+ * current CPU is difficult: some operating systems provide a system call to
+ * acquire a CPU ID, but not all (regardless, making a system call to increment
+ * a statistics value is far too expensive).
+ *
+ * Our second-best option is to use the thread ID. Unfortunately, there is no
+ * portable way to obtain a unique thread ID that's a small-enough number to
+ * be used as an array index (portable thread IDs are usually a pointer or an
+ * opaque chunk, not a simple integer).
+ *
+ * Our solution is to use the session ID; there is normally a session per thread
+ * and the session ID is a small, monotonically increasing number.
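+ *
+ * For example (a sketch; the session ID of 30 is purely hypothetical):
+ *
+ *	slot = session->id % WT_COUNTER_SLOTS;	-- 30 % 23 == 7
+ *	stats[slot]->cursor_search += 1;	-- update slot 7's copy only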
 */
-#define WT_STAT(stats, fld) \
-	((stats)->fld.v)
-#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
-	(void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
-#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
-	(void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
-#define WT_STAT_DECRV(stats, fld, value) do { \
-	(stats)->fld.v -= (value); \
-} while (0)
-#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
-#define WT_STAT_INCRV(stats, fld, value) do { \
-	(stats)->fld.v += (value); \
-} while (0)
-#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
-#define WT_STAT_SET(stats, fld, value) do { \
-	(stats)->fld.v = (uint64_t)(value); \
-} while (0)
+#define WT_STATS_SLOT_ID(session) \
+	(((session)->id) % WT_COUNTER_SLOTS)

 /*
- * Read/write statistics if "fast" statistics are configured.
+ * Statistic structures are arrays of int64_t's. We have functions to read/write
+ * those structures regardless of the specific statistic structure we're working
+ * with, by translating statistics structure field names to structure offsets.
+ *
+ * Translate a statistic's value name to an offset.
 */
-#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
-	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
-		WT_STAT_ATOMIC_DECRV(stats, fld, value); \
-} while (0)
-#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
-	WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
-#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
-	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
-		WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+#define WT_STATS_FIELD_TO_SLOT(stats, fld) \
+	(int)(&(stats)[0]->fld - (int64_t *)(stats)[0])
+
+/*
+ * Sum the values from all structures in the array.
+ */
+static inline int64_t
+__wt_stats_aggregate(void *stats_arg, int slot)
+{
+	int64_t **stats, aggr_v;
+	int i;
+
+	stats = stats_arg;
+	for (aggr_v = 0, i = 0; i < WT_COUNTER_SLOTS; i++)
+		aggr_v += stats[i][slot];
+
+	/*
+	 * This can race. However, any implementation with a single value can
+	 * race as well; different threads could set the same counter value
+	 * simultaneously. While we are making races more likely, we are not
+	 * fundamentally weakening the isolation semantics found in updating a
+	 * single value.
+	 *
+	 * Additionally, the aggregation can go negative (imagine a thread
+	 * incrementing a value after aggregation has passed its slot and a
+	 * second thread decrementing a value before aggregation has reached
+	 * its slot).
+	 *
+	 * For historic API compatibility, the external type is a uint64_t;
+	 * limit our return to non-negative values, since negative numbers
+	 * would just look really, really large.
+	 */
+	if (aggr_v < 0)
+		aggr_v = 0;
+	return (aggr_v);
+}
+
+/*
+ * Clear the values in all structures in the array.
+ */
+static inline void
+__wt_stats_clear(void *stats_arg, int slot)
+{
+	int64_t **stats;
+	int i;
+
+	stats = stats_arg;
+	for (i = 0; i < WT_COUNTER_SLOTS; i++)
+		stats[i][slot] = 0;
+}
+
+/*
+ * Read/write statistics without any test for statistics configuration. Reading
+ * and writing the field requires different actions: reading sums the values
+ * across the array of structures, writing updates a single structure's value.
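+ *
+ * A minimal sketch of the two access patterns (the local variable names are
+ * hypothetical):
+ *
+ *	-- write: touch only this thread's slot
+ *	stats[WT_STATS_SLOT_ID(session)]->cursor_insert += 1;
+ *	-- read: sum every slot, clamping the racy total at zero
+ *	total = __wt_stats_aggregate(
+ *	    stats, WT_STATS_FIELD_TO_SLOT(stats, cursor_insert));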
+ */
+#define WT_STAT_READ(stats, fld) \
+	__wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
+#define WT_STAT_WRITE(session, stats, fld) \
+	((stats)[WT_STATS_SLOT_ID(session)]->fld)
+
+#define WT_STAT_DECRV(session, stats, fld, value) \
+	(stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
+#define WT_STAT_DECR(session, stats, fld) \
+	WT_STAT_DECRV(session, stats, fld, 1)
+#define WT_STAT_INCRV(session, stats, fld, value) \
+	(stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value)
+#define WT_STAT_INCR(session, stats, fld) \
+	WT_STAT_INCRV(session, stats, fld, 1)
+#define WT_STAT_SET(session, stats, fld, value) do { \
+	__wt_stats_clear(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)); \
+	(stats)[0]->fld = (int64_t)(value); \
 } while (0)
-#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
-	WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+
+/*
+ * Update statistics if "fast" statistics are configured.
+ */
 #define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
 	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
-		WT_STAT_DECRV(stats, fld, value); \
+		WT_STAT_DECRV(session, stats, fld, value); \
 } while (0)
 #define WT_STAT_FAST_DECR(session, stats, fld) \
 	WT_STAT_FAST_DECRV(session, stats, fld, 1)
 #define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
 	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
-		WT_STAT_INCRV(stats, fld, value); \
+		WT_STAT_INCRV(session, stats, fld, value); \
 } while (0)
 #define WT_STAT_FAST_INCR(session, stats, fld) \
 	WT_STAT_FAST_INCRV(session, stats, fld, 1)
 #define WT_STAT_FAST_SET(session, stats, fld, value) do { \
 	if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
-		WT_STAT_SET(stats, fld, value); \
+		WT_STAT_SET(session, stats, fld, value); \
 } while (0)

 /*
- * Read/write connection handle statistics if "fast" statistics are configured.
+ * Update connection handle statistics if "fast" statistics are configured.
 */
-#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
-	WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
-	WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
-#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
-	WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
-	WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
 #define WT_STAT_FAST_CONN_DECR(session, fld) \
-	WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+	WT_STAT_FAST_DECR(session, S2C(session)->stats, fld)
 #define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
-	WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+	WT_STAT_FAST_DECRV(session, S2C(session)->stats, fld, value)
 #define WT_STAT_FAST_CONN_INCR(session, fld) \
-	WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+	WT_STAT_FAST_INCR(session, S2C(session)->stats, fld)
 #define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
-	WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+	WT_STAT_FAST_INCRV(session, S2C(session)->stats, fld, value)
 #define WT_STAT_FAST_CONN_SET(session, fld, value) \
-	WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+	WT_STAT_FAST_SET(session, S2C(session)->stats, fld, value)

 /*
- * Read/write data-source handle statistics if the data-source handle is set
- * and "fast" statistics are configured.
+ * Update data-source handle statistics if "fast" statistics are configured + * and the data-source handle is set. * * XXX * We shouldn't have to check if the data-source handle is NULL, but it's - * useful until everything is converted to using data-source handles. + * necessary until everything is converted to using data-source handles. */ #define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \ if ((session)->dhandle != NULL) \ WT_STAT_FAST_DECRV( \ - session, &(session)->dhandle->stats, fld, value); \ + session, (session)->dhandle->stats, fld, value); \ } while (0) #define WT_STAT_FAST_DATA_DECR(session, fld) \ WT_STAT_FAST_DATA_DECRV(session, fld, 1) #define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \ if ((session)->dhandle != NULL) \ WT_STAT_FAST_INCRV( \ - session, &(session)->dhandle->stats, fld, value); \ + session, (session)->dhandle->stats, fld, value); \ } while (0) #define WT_STAT_FAST_DATA_INCR(session, fld) \ WT_STAT_FAST_DATA_INCRV(session, fld, 1) #define WT_STAT_FAST_DATA_SET(session, fld, value) do { \ if ((session)->dhandle != NULL) \ WT_STAT_FAST_SET( \ - session, &(session)->dhandle->stats, fld, value); \ + session, (session)->dhandle->stats, fld, value); \ } while (0) -/* Connection handle statistics value. */ -#define WT_CONN_STAT(session, fld) \ - WT_STAT(&S2C(session)->stats, fld) - /* * DO NOT EDIT: automatically built by dist/stat.py. */ @@ -132,148 +227,157 @@ struct __wt_stats { */ #define WT_CONNECTION_STATS_BASE 1000 struct __wt_connection_stats { - WT_STATS async_alloc_race; - WT_STATS async_alloc_view; - WT_STATS async_cur_queue; - WT_STATS async_flush; - WT_STATS async_full; - WT_STATS async_max_queue; - WT_STATS async_nowork; - WT_STATS async_op_alloc; - WT_STATS async_op_compact; - WT_STATS async_op_insert; - WT_STATS async_op_remove; - WT_STATS async_op_search; - WT_STATS async_op_update; - WT_STATS block_byte_map_read; - WT_STATS block_byte_read; - WT_STATS block_byte_write; - WT_STATS block_map_read; - WT_STATS block_preload; - WT_STATS block_read; - WT_STATS block_write; - WT_STATS cache_bytes_dirty; - WT_STATS cache_bytes_internal; - WT_STATS cache_bytes_inuse; - WT_STATS cache_bytes_leaf; - WT_STATS cache_bytes_max; - WT_STATS cache_bytes_overflow; - WT_STATS cache_bytes_read; - WT_STATS cache_bytes_write; - WT_STATS cache_eviction_app; - WT_STATS cache_eviction_checkpoint; - WT_STATS cache_eviction_clean; - WT_STATS cache_eviction_deepen; - WT_STATS cache_eviction_dirty; - WT_STATS cache_eviction_fail; - WT_STATS cache_eviction_force; - WT_STATS cache_eviction_force_delete; - WT_STATS cache_eviction_force_fail; - WT_STATS cache_eviction_hazard; - WT_STATS cache_eviction_internal; - WT_STATS cache_eviction_maximum_page_size; - WT_STATS cache_eviction_queue_empty; - WT_STATS cache_eviction_queue_not_empty; - WT_STATS cache_eviction_server_evicting; - WT_STATS cache_eviction_server_not_evicting; - WT_STATS cache_eviction_slow; - WT_STATS cache_eviction_split; - WT_STATS cache_eviction_walk; - WT_STATS cache_eviction_worker_evicting; - WT_STATS cache_inmem_split; - WT_STATS cache_overhead; - WT_STATS cache_pages_dirty; - WT_STATS cache_pages_inuse; - WT_STATS cache_read; - WT_STATS cache_write; - WT_STATS cond_wait; - WT_STATS cursor_create; - WT_STATS cursor_insert; - WT_STATS cursor_next; - WT_STATS cursor_prev; - WT_STATS cursor_remove; - WT_STATS cursor_reset; - WT_STATS cursor_search; - WT_STATS cursor_search_near; - WT_STATS cursor_update; - WT_STATS dh_conn_handles; - WT_STATS dh_conn_ref; - WT_STATS dh_conn_sweeps; - WT_STATS 
dh_conn_tod; - WT_STATS dh_session_handles; - WT_STATS dh_session_sweeps; - WT_STATS file_open; - WT_STATS log_buffer_size; - WT_STATS log_bytes_payload; - WT_STATS log_bytes_written; - WT_STATS log_close_yields; - WT_STATS log_compress_len; - WT_STATS log_compress_mem; - WT_STATS log_compress_small; - WT_STATS log_compress_write_fails; - WT_STATS log_compress_writes; - WT_STATS log_max_filesize; - WT_STATS log_prealloc_files; - WT_STATS log_prealloc_max; - WT_STATS log_prealloc_used; - WT_STATS log_release_write_lsn; - WT_STATS log_scan_records; - WT_STATS log_scan_rereads; - WT_STATS log_scans; - WT_STATS log_slot_closes; - WT_STATS log_slot_coalesced; - WT_STATS log_slot_consolidated; - WT_STATS log_slot_joins; - WT_STATS log_slot_races; - WT_STATS log_slot_toobig; - WT_STATS log_slot_toosmall; - WT_STATS log_slot_transitions; - WT_STATS log_sync; - WT_STATS log_sync_dir; - WT_STATS log_write_lsn; - WT_STATS log_writes; - WT_STATS lsm_checkpoint_throttle; - WT_STATS lsm_merge_throttle; - WT_STATS lsm_rows_merged; - WT_STATS lsm_work_queue_app; - WT_STATS lsm_work_queue_manager; - WT_STATS lsm_work_queue_max; - WT_STATS lsm_work_queue_switch; - WT_STATS lsm_work_units_created; - WT_STATS lsm_work_units_discarded; - WT_STATS lsm_work_units_done; - WT_STATS memory_allocation; - WT_STATS memory_free; - WT_STATS memory_grow; - WT_STATS page_busy_blocked; - WT_STATS page_forcible_evict_blocked; - WT_STATS page_locked_blocked; - WT_STATS page_read_blocked; - WT_STATS page_sleep; - WT_STATS read_io; - WT_STATS rec_pages; - WT_STATS rec_pages_eviction; - WT_STATS rec_split_stashed_bytes; - WT_STATS rec_split_stashed_objects; - WT_STATS rwlock_read; - WT_STATS rwlock_write; - WT_STATS session_cursor_open; - WT_STATS session_open; - WT_STATS txn_begin; - WT_STATS txn_checkpoint; - WT_STATS txn_checkpoint_generation; - WT_STATS txn_checkpoint_running; - WT_STATS txn_checkpoint_time_max; - WT_STATS txn_checkpoint_time_min; - WT_STATS txn_checkpoint_time_recent; - WT_STATS txn_checkpoint_time_total; - WT_STATS txn_commit; - WT_STATS txn_fail_cache; - WT_STATS txn_pinned_checkpoint_range; - WT_STATS txn_pinned_range; - WT_STATS txn_rollback; - WT_STATS txn_sync; - WT_STATS write_io; + int64_t async_alloc_race; + int64_t async_alloc_view; + int64_t async_cur_queue; + int64_t async_flush; + int64_t async_full; + int64_t async_max_queue; + int64_t async_nowork; + int64_t async_op_alloc; + int64_t async_op_compact; + int64_t async_op_insert; + int64_t async_op_remove; + int64_t async_op_search; + int64_t async_op_update; + int64_t block_byte_map_read; + int64_t block_byte_read; + int64_t block_byte_write; + int64_t block_map_read; + int64_t block_preload; + int64_t block_read; + int64_t block_write; + int64_t cache_bytes_dirty; + int64_t cache_bytes_internal; + int64_t cache_bytes_inuse; + int64_t cache_bytes_leaf; + int64_t cache_bytes_max; + int64_t cache_bytes_overflow; + int64_t cache_bytes_read; + int64_t cache_bytes_write; + int64_t cache_eviction_app; + int64_t cache_eviction_checkpoint; + int64_t cache_eviction_clean; + int64_t cache_eviction_deepen; + int64_t cache_eviction_dirty; + int64_t cache_eviction_fail; + int64_t cache_eviction_force; + int64_t cache_eviction_force_delete; + int64_t cache_eviction_force_fail; + int64_t cache_eviction_hazard; + int64_t cache_eviction_internal; + int64_t cache_eviction_maximum_page_size; + int64_t cache_eviction_queue_empty; + int64_t cache_eviction_queue_not_empty; + int64_t cache_eviction_server_evicting; + int64_t cache_eviction_server_not_evicting; + 
int64_t cache_eviction_slow; + int64_t cache_eviction_split; + int64_t cache_eviction_walk; + int64_t cache_eviction_worker_evicting; + int64_t cache_inmem_split; + int64_t cache_inmem_splittable; + int64_t cache_lookaside_insert; + int64_t cache_lookaside_remove; + int64_t cache_overhead; + int64_t cache_pages_dirty; + int64_t cache_pages_inuse; + int64_t cache_read; + int64_t cache_read_lookaside; + int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; + int64_t cond_wait; + int64_t cursor_create; + int64_t cursor_insert; + int64_t cursor_next; + int64_t cursor_prev; + int64_t cursor_remove; + int64_t cursor_reset; + int64_t cursor_restart; + int64_t cursor_search; + int64_t cursor_search_near; + int64_t cursor_update; + int64_t dh_conn_handle_count; + int64_t dh_session_handles; + int64_t dh_session_sweeps; + int64_t dh_sweep_close; + int64_t dh_sweep_ref; + int64_t dh_sweep_remove; + int64_t dh_sweep_tod; + int64_t dh_sweeps; + int64_t file_open; + int64_t log_buffer_size; + int64_t log_bytes_payload; + int64_t log_bytes_written; + int64_t log_close_yields; + int64_t log_compress_len; + int64_t log_compress_mem; + int64_t log_compress_small; + int64_t log_compress_write_fails; + int64_t log_compress_writes; + int64_t log_max_filesize; + int64_t log_prealloc_files; + int64_t log_prealloc_max; + int64_t log_prealloc_used; + int64_t log_release_write_lsn; + int64_t log_scan_records; + int64_t log_scan_rereads; + int64_t log_scans; + int64_t log_slot_closes; + int64_t log_slot_coalesced; + int64_t log_slot_consolidated; + int64_t log_slot_joins; + int64_t log_slot_races; + int64_t log_slot_switch_busy; + int64_t log_slot_transitions; + int64_t log_slot_unbuffered; + int64_t log_sync; + int64_t log_sync_dir; + int64_t log_write_lsn; + int64_t log_writes; + int64_t lsm_checkpoint_throttle; + int64_t lsm_merge_throttle; + int64_t lsm_rows_merged; + int64_t lsm_work_queue_app; + int64_t lsm_work_queue_manager; + int64_t lsm_work_queue_max; + int64_t lsm_work_queue_switch; + int64_t lsm_work_units_created; + int64_t lsm_work_units_discarded; + int64_t lsm_work_units_done; + int64_t memory_allocation; + int64_t memory_free; + int64_t memory_grow; + int64_t page_busy_blocked; + int64_t page_forcible_evict_blocked; + int64_t page_locked_blocked; + int64_t page_read_blocked; + int64_t page_sleep; + int64_t read_io; + int64_t rec_pages; + int64_t rec_pages_eviction; + int64_t rec_split_stashed_bytes; + int64_t rec_split_stashed_objects; + int64_t rwlock_read; + int64_t rwlock_write; + int64_t session_cursor_open; + int64_t session_open; + int64_t txn_begin; + int64_t txn_checkpoint; + int64_t txn_checkpoint_generation; + int64_t txn_checkpoint_running; + int64_t txn_checkpoint_time_max; + int64_t txn_checkpoint_time_min; + int64_t txn_checkpoint_time_recent; + int64_t txn_checkpoint_time_total; + int64_t txn_commit; + int64_t txn_fail_cache; + int64_t txn_pinned_checkpoint_range; + int64_t txn_pinned_range; + int64_t txn_rollback; + int64_t txn_sync; + int64_t write_io; }; /* @@ -281,96 +385,102 @@ struct __wt_connection_stats { */ #define WT_DSRC_STATS_BASE 2000 struct __wt_dsrc_stats { - WT_STATS allocation_size; - WT_STATS block_alloc; - WT_STATS block_checkpoint_size; - WT_STATS block_extension; - WT_STATS block_free; - WT_STATS block_magic; - WT_STATS block_major; - WT_STATS block_minor; - WT_STATS block_reuse_bytes; - WT_STATS block_size; - WT_STATS bloom_count; - WT_STATS bloom_false_positive; - WT_STATS bloom_hit; - WT_STATS bloom_miss; - WT_STATS 
bloom_page_evict; - WT_STATS bloom_page_read; - WT_STATS bloom_size; - WT_STATS btree_checkpoint_generation; - WT_STATS btree_column_deleted; - WT_STATS btree_column_fix; - WT_STATS btree_column_internal; - WT_STATS btree_column_variable; - WT_STATS btree_compact_rewrite; - WT_STATS btree_entries; - WT_STATS btree_fixed_len; - WT_STATS btree_maximum_depth; - WT_STATS btree_maxintlkey; - WT_STATS btree_maxintlpage; - WT_STATS btree_maxleafkey; - WT_STATS btree_maxleafpage; - WT_STATS btree_maxleafvalue; - WT_STATS btree_overflow; - WT_STATS btree_row_internal; - WT_STATS btree_row_leaf; - WT_STATS cache_bytes_read; - WT_STATS cache_bytes_write; - WT_STATS cache_eviction_checkpoint; - WT_STATS cache_eviction_clean; - WT_STATS cache_eviction_deepen; - WT_STATS cache_eviction_dirty; - WT_STATS cache_eviction_fail; - WT_STATS cache_eviction_hazard; - WT_STATS cache_eviction_internal; - WT_STATS cache_eviction_split; - WT_STATS cache_inmem_split; - WT_STATS cache_overflow_value; - WT_STATS cache_read; - WT_STATS cache_read_overflow; - WT_STATS cache_write; - WT_STATS compress_raw_fail; - WT_STATS compress_raw_fail_temporary; - WT_STATS compress_raw_ok; - WT_STATS compress_read; - WT_STATS compress_write; - WT_STATS compress_write_fail; - WT_STATS compress_write_too_small; - WT_STATS cursor_create; - WT_STATS cursor_insert; - WT_STATS cursor_insert_bulk; - WT_STATS cursor_insert_bytes; - WT_STATS cursor_next; - WT_STATS cursor_prev; - WT_STATS cursor_remove; - WT_STATS cursor_remove_bytes; - WT_STATS cursor_reset; - WT_STATS cursor_search; - WT_STATS cursor_search_near; - WT_STATS cursor_update; - WT_STATS cursor_update_bytes; - WT_STATS lsm_checkpoint_throttle; - WT_STATS lsm_chunk_count; - WT_STATS lsm_generation_max; - WT_STATS lsm_lookup_no_bloom; - WT_STATS lsm_merge_throttle; - WT_STATS rec_dictionary; - WT_STATS rec_multiblock_internal; - WT_STATS rec_multiblock_leaf; - WT_STATS rec_multiblock_max; - WT_STATS rec_overflow_key_internal; - WT_STATS rec_overflow_key_leaf; - WT_STATS rec_overflow_value; - WT_STATS rec_page_delete; - WT_STATS rec_page_match; - WT_STATS rec_pages; - WT_STATS rec_pages_eviction; - WT_STATS rec_prefix_compression; - WT_STATS rec_suffix_compression; - WT_STATS session_compact; - WT_STATS session_cursor_open; - WT_STATS txn_update_conflict; + int64_t allocation_size; + int64_t block_alloc; + int64_t block_checkpoint_size; + int64_t block_extension; + int64_t block_free; + int64_t block_magic; + int64_t block_major; + int64_t block_minor; + int64_t block_reuse_bytes; + int64_t block_size; + int64_t bloom_count; + int64_t bloom_false_positive; + int64_t bloom_hit; + int64_t bloom_miss; + int64_t bloom_page_evict; + int64_t bloom_page_read; + int64_t bloom_size; + int64_t btree_checkpoint_generation; + int64_t btree_column_deleted; + int64_t btree_column_fix; + int64_t btree_column_internal; + int64_t btree_column_rle; + int64_t btree_column_variable; + int64_t btree_compact_rewrite; + int64_t btree_entries; + int64_t btree_fixed_len; + int64_t btree_maximum_depth; + int64_t btree_maxintlkey; + int64_t btree_maxintlpage; + int64_t btree_maxleafkey; + int64_t btree_maxleafpage; + int64_t btree_maxleafvalue; + int64_t btree_overflow; + int64_t btree_row_internal; + int64_t btree_row_leaf; + int64_t cache_bytes_read; + int64_t cache_bytes_write; + int64_t cache_eviction_checkpoint; + int64_t cache_eviction_clean; + int64_t cache_eviction_deepen; + int64_t cache_eviction_dirty; + int64_t cache_eviction_fail; + int64_t cache_eviction_hazard; + int64_t 
cache_eviction_internal; + int64_t cache_eviction_split; + int64_t cache_inmem_split; + int64_t cache_inmem_splittable; + int64_t cache_overflow_value; + int64_t cache_read; + int64_t cache_read_lookaside; + int64_t cache_read_overflow; + int64_t cache_write; + int64_t cache_write_lookaside; + int64_t cache_write_restore; + int64_t compress_raw_fail; + int64_t compress_raw_fail_temporary; + int64_t compress_raw_ok; + int64_t compress_read; + int64_t compress_write; + int64_t compress_write_fail; + int64_t compress_write_too_small; + int64_t cursor_create; + int64_t cursor_insert; + int64_t cursor_insert_bulk; + int64_t cursor_insert_bytes; + int64_t cursor_next; + int64_t cursor_prev; + int64_t cursor_remove; + int64_t cursor_remove_bytes; + int64_t cursor_reset; + int64_t cursor_restart; + int64_t cursor_search; + int64_t cursor_search_near; + int64_t cursor_update; + int64_t cursor_update_bytes; + int64_t lsm_checkpoint_throttle; + int64_t lsm_chunk_count; + int64_t lsm_generation_max; + int64_t lsm_lookup_no_bloom; + int64_t lsm_merge_throttle; + int64_t rec_dictionary; + int64_t rec_multiblock_internal; + int64_t rec_multiblock_leaf; + int64_t rec_multiblock_max; + int64_t rec_overflow_key_internal; + int64_t rec_overflow_key_leaf; + int64_t rec_overflow_value; + int64_t rec_page_delete; + int64_t rec_page_match; + int64_t rec_pages; + int64_t rec_pages_eviction; + int64_t rec_prefix_compression; + int64_t rec_suffix_compression; + int64_t session_compact; + int64_t session_cursor_open; + int64_t txn_update_conflict; }; /* Statistics section: END */ diff --git a/src/include/txn.h b/src/include/txn.h index 7a67f713244..4a325c70a95 100644 --- a/src/include/txn.h +++ b/src/include/txn.h @@ -31,7 +31,7 @@ struct __wt_named_snapshot { const char *name; - STAILQ_ENTRY(__wt_named_snapshot) q; + TAILQ_ENTRY(__wt_named_snapshot) q; uint64_t snap_min, snap_max; uint64_t *snapshot; @@ -72,15 +72,14 @@ struct __wt_txn_global { /* Named snapshot state. */ WT_RWLOCK *nsnap_rwlock; volatile uint64_t nsnap_oldest_id; - STAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; + TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph; WT_TXN_STATE *states; /* Per-session transaction states */ }; typedef enum __wt_txn_isolation { - WT_ISO_EVICTION, /* Internal: eviction context */ - WT_ISO_READ_UNCOMMITTED, WT_ISO_READ_COMMITTED, + WT_ISO_READ_UNCOMMITTED, WT_ISO_SNAPSHOT } WT_TXN_ISOLATION; diff --git a/src/include/txn.i b/src/include/txn.i index a9b54d26e47..2b42990f5e5 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -140,12 +140,22 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session) } /* + * __wt_txn_committed -- + * Return if a transaction has been committed. + */ +static inline bool +__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id) +{ + return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running)); +} + +/* * __wt_txn_visible_all -- * Check if a given transaction ID is "globally visible". This is, if * all sessions in the system will see the transaction ID including the * ID that belongs to a running checkpoint. */ -static inline int +static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) { uint64_t oldest_id; @@ -159,28 +169,21 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id) * __wt_txn_visible -- * Can the current transaction see the given ID? 
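+ *
+ * As a summary sketch (restating the code below, not adding to it), the
+ * decision ladder is:
+ *
+ *	id == WT_TXN_NONE	-> visible to everyone
+ *	id == WT_TXN_ABORTED	-> visible to no one
+ *	read-uncommitted, or reading the metadata file -> visible
+ *	id == our own transaction's ID -> visible
+ *	otherwise -> visible only if id was committed before our snapshot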
 */
-static inline int
+static inline bool
 __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
 {
 	WT_TXN *txn;
-	int found;
+	bool found;

 	txn = &session->txn;

 	/* Changes with no associated transaction are always visible. */
 	if (id == WT_TXN_NONE)
-		return (1);
+		return (true);

 	/* Nobody sees the results of aborted transactions. */
 	if (id == WT_TXN_ABORTED)
-		return (0);
-
-	/*
-	 * Eviction only sees globally visible updates, or if there is a
-	 * checkpoint transaction running, use its transaction.
-	 */
-	if (txn->isolation == WT_ISO_EVICTION)
-		return (__wt_txn_visible_all(session, id));
+		return (false);

 	/*
 	 * Read-uncommitted transactions see all other changes.
@@ -194,11 +197,11 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
 	 */
 	if (txn->isolation == WT_ISO_READ_UNCOMMITTED ||
 	    session->dhandle == session->meta_dhandle)
-		return (1);
+		return (true);

 	/* Transactions see their own changes. */
 	if (id == txn->id)
-		return (1);
+		return (true);

 	/*
 	 * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
@@ -210,9 +213,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
 	 * snapshot is empty.
 	 */
 	if (WT_TXNID_LE(txn->snap_max, id))
-		return (0);
+		return (false);
 	if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
-		return (1);
+		return (true);

 	WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
 	return (!found);
@@ -266,7 +269,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
 	}
 	F_SET(txn, WT_TXN_RUNNING);
 	return (0);
 }

 /*
@@ -300,7 +303,7 @@ __wt_txn_new_id(WT_SESSION_IMPL *session)
 	 * global current ID, so we want post-increment semantics. Our atomic
 	 * add primitive does pre-increment, so adjust the result here.
 	 */
-	return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1);
+	return (__wt_atomic_addv64(&S2C(session)->txn_global.current, 1) - 1);
 }

 /*
@@ -376,8 +379,9 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
 	 */
 	do {
 		txn_state->id = txn->id = txn_global->current;
-	} while (!WT_ATOMIC_CAS8(
-	    txn_global->current, txn->id, txn->id + 1));
+	} while (!__wt_atomic_casv64(
+	    &txn_global->current, txn->id, txn->id + 1) ||
+	    WT_TXNID_LT(txn->id, txn_global->last_running));

 	/*
 	 * If we have used 64-bits of transaction IDs, there is nothing
@@ -476,7 +480,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
 * __wt_txn_am_oldest --
 *	Am I the oldest transaction in the system?
 */
-static inline int
+static inline bool
 __wt_txn_am_oldest(WT_SESSION_IMPL *session)
 {
 	WT_CONNECTION_IMPL *conn;
@@ -491,12 +495,12 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session)
 	txn_global = &conn->txn_global;

 	if (txn->id == WT_TXN_NONE)
-		return (0);
+		return (false);

 	WT_ORDERED_READ(session_cnt, conn->session_cnt);
 	for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
 		if ((id = s->id) != WT_TXN_NONE &&
 		    WT_TXNID_LT(id, txn->id))
-			return (0);
+			return (false);

-	return (1);
+	return (true);
 }
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index e8f3b9958ce..71ba3f41a44 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1750,6 +1750,9 @@ struct __wt_connection {
 * @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that
 * is shared between databases or \c "none" when no shared cache is
 * configured., a string; default \c none.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of cache this
+ * database can be allocated from the shared cache.
Defaults to the + * entire shared cache size., an integer; default \c 0.} * @config{ reserve, amount of cache this * database is guaranteed to have available from the shared cache. This * setting is per database. Defaults to the chunk size., an integer; @@ -2072,8 +2075,10 @@ struct __wt_connection { * @config{checkpoint_sync, flush files to stable storage when closing or * writing checkpoints., a boolean flag; default \c true.} * @config{config_base, write the base configuration file if creating the - * database\, see @ref config_base for more information., a boolean flag; - * default \c true.} + * database. If \c false in the config passed directly to ::wiredtiger_open\, + * will ignore any existing base configuration file in addition to not creating + * one. See @ref config_base for more information., a boolean flag; default \c + * true.} * @config{create, create the database if it does not exist., a boolean flag; * default \c false.} * @config{direct_io, Use \c O_DIRECT to access files. Options are given as a @@ -2214,10 +2219,12 @@ struct __wt_connection { * @config{ name, the name of a cache that is shared * between databases or \c "none" when no shared cache is configured., a string; * default \c none.} - * @config{ reserve, amount of cache - * this database is guaranteed to have available from the shared cache. This - * setting is per database. Defaults to the chunk size., an integer; default \c - * 0.} + * @config{ quota, maximum size of + * cache this database can be allocated from the shared cache. Defaults to the + * entire shared cache size., an integer; default \c 0.} + * @config{ reserve, amount of cache this database is + * guaranteed to have available from the shared cache. This setting is per + * database. Defaults to the chunk size., an integer; default \c 0.} * @config{ size, maximum memory to allocate for the * shared cache. Setting this will update the value if one is already set., an * integer between 1MB and 10TB; default \c 500MB.} @@ -3640,192 +3647,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047 /*! cache: in-memory page splits */ #define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049 +/*! cache: lookaside table insert calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050 +/*! cache: lookaside table remove calls */ +#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051 /*! cache: percentage overhead */ -#define WT_STAT_CONN_CACHE_OVERHEAD 1049 +#define WT_STAT_CONN_CACHE_OVERHEAD 1052 /*! cache: tracked dirty pages in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050 +#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053 /*! cache: pages currently held in the cache */ -#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051 +#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054 /*! cache: pages read into cache */ -#define WT_STAT_CONN_CACHE_READ 1052 +#define WT_STAT_CONN_CACHE_READ 1055 +/*! cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056 /*! cache: pages written from cache */ -#define WT_STAT_CONN_CACHE_WRITE 1053 +#define WT_STAT_CONN_CACHE_WRITE 1057 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059 /*! 
connection: pthread mutex condition wait calls */ -#define WT_STAT_CONN_COND_WAIT 1054 +#define WT_STAT_CONN_COND_WAIT 1060 /*! cursor: cursor create calls */ -#define WT_STAT_CONN_CURSOR_CREATE 1055 +#define WT_STAT_CONN_CURSOR_CREATE 1061 /*! cursor: cursor insert calls */ -#define WT_STAT_CONN_CURSOR_INSERT 1056 +#define WT_STAT_CONN_CURSOR_INSERT 1062 /*! cursor: cursor next calls */ -#define WT_STAT_CONN_CURSOR_NEXT 1057 +#define WT_STAT_CONN_CURSOR_NEXT 1063 /*! cursor: cursor prev calls */ -#define WT_STAT_CONN_CURSOR_PREV 1058 +#define WT_STAT_CONN_CURSOR_PREV 1064 /*! cursor: cursor remove calls */ -#define WT_STAT_CONN_CURSOR_REMOVE 1059 +#define WT_STAT_CONN_CURSOR_REMOVE 1065 /*! cursor: cursor reset calls */ -#define WT_STAT_CONN_CURSOR_RESET 1060 +#define WT_STAT_CONN_CURSOR_RESET 1066 +/*! cursor: cursor restarted searches */ +#define WT_STAT_CONN_CURSOR_RESTART 1067 /*! cursor: cursor search calls */ -#define WT_STAT_CONN_CURSOR_SEARCH 1061 +#define WT_STAT_CONN_CURSOR_SEARCH 1068 /*! cursor: cursor search near calls */ -#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1062 +#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069 /*! cursor: cursor update calls */ -#define WT_STAT_CONN_CURSOR_UPDATE 1063 -/*! data-handle: connection dhandles swept */ -#define WT_STAT_CONN_DH_CONN_HANDLES 1064 -/*! data-handle: connection candidate referenced */ -#define WT_STAT_CONN_DH_CONN_REF 1065 -/*! data-handle: connection sweeps */ -#define WT_STAT_CONN_DH_CONN_SWEEPS 1066 -/*! data-handle: connection time-of-death sets */ -#define WT_STAT_CONN_DH_CONN_TOD 1067 +#define WT_STAT_CONN_CURSOR_UPDATE 1070 +/*! data-handle: connection data handles currently active */ +#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071 /*! data-handle: session dhandles swept */ -#define WT_STAT_CONN_DH_SESSION_HANDLES 1068 +#define WT_STAT_CONN_DH_SESSION_HANDLES 1072 /*! data-handle: session sweep attempts */ -#define WT_STAT_CONN_DH_SESSION_SWEEPS 1069 +#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073 +/*! data-handle: connection sweep dhandles closed */ +#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074 +/*! data-handle: connection sweep candidate became referenced */ +#define WT_STAT_CONN_DH_SWEEP_REF 1075 +/*! data-handle: connection sweep dhandles removed from hash list */ +#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076 +/*! data-handle: connection sweep time-of-death sets */ +#define WT_STAT_CONN_DH_SWEEP_TOD 1077 +/*! data-handle: connection sweeps */ +#define WT_STAT_CONN_DH_SWEEPS 1078 /*! connection: files currently open */ -#define WT_STAT_CONN_FILE_OPEN 1070 +#define WT_STAT_CONN_FILE_OPEN 1079 /*! log: total log buffer size */ -#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071 +#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080 /*! log: log bytes of payload data */ -#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072 +#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081 /*! log: log bytes written */ -#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073 +#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082 /*! log: yields waiting for previous log file close */ -#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074 +#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083 /*! log: total size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075 +#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084 /*! log: total in-memory size of compressed records */ -#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076 +#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085 /*! log: log records too small to compress */ -#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077 +#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086 /*! 
log: log records not compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078 +#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087 /*! log: log records compressed */ -#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079 +#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088 /*! log: maximum log file size */ -#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080 +#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089 /*! log: pre-allocated log files prepared */ -#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081 +#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090 /*! log: number of pre-allocated log files to create */ -#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082 +#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091 /*! log: pre-allocated log files used */ -#define WT_STAT_CONN_LOG_PREALLOC_USED 1083 +#define WT_STAT_CONN_LOG_PREALLOC_USED 1092 /*! log: log release advances write LSN */ -#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1084 +#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093 /*! log: records processed by log scan */ -#define WT_STAT_CONN_LOG_SCAN_RECORDS 1085 +#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094 /*! log: log scan records requiring two reads */ -#define WT_STAT_CONN_LOG_SCAN_REREADS 1086 +#define WT_STAT_CONN_LOG_SCAN_REREADS 1095 /*! log: log scan operations */ -#define WT_STAT_CONN_LOG_SCANS 1087 +#define WT_STAT_CONN_LOG_SCANS 1096 /*! log: consolidated slot closures */ -#define WT_STAT_CONN_LOG_SLOT_CLOSES 1088 +#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097 /*! log: written slots coalesced */ -#define WT_STAT_CONN_LOG_SLOT_COALESCED 1089 +#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098 /*! log: logging bytes consolidated */ -#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090 +#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099 /*! log: consolidated slot joins */ -#define WT_STAT_CONN_LOG_SLOT_JOINS 1091 +#define WT_STAT_CONN_LOG_SLOT_JOINS 1100 /*! log: consolidated slot join races */ -#define WT_STAT_CONN_LOG_SLOT_RACES 1092 -/*! log: record size exceeded maximum */ -#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093 -/*! log: failed to find a slot large enough for record */ -#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1094 +#define WT_STAT_CONN_LOG_SLOT_RACES 1101 +/*! log: busy returns attempting to switch slots */ +#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102 /*! log: consolidated slot join transitions */ -#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1095 +#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103 +/*! log: consolidated slot unbuffered writes */ +#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1104 /*! log: log sync operations */ -#define WT_STAT_CONN_LOG_SYNC 1096 +#define WT_STAT_CONN_LOG_SYNC 1105 /*! log: log sync_dir operations */ -#define WT_STAT_CONN_LOG_SYNC_DIR 1097 +#define WT_STAT_CONN_LOG_SYNC_DIR 1106 /*! log: log server thread advances write LSN */ -#define WT_STAT_CONN_LOG_WRITE_LSN 1098 +#define WT_STAT_CONN_LOG_WRITE_LSN 1107 /*! log: log write operations */ -#define WT_STAT_CONN_LOG_WRITES 1099 +#define WT_STAT_CONN_LOG_WRITES 1108 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1100 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1109 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1101 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1110 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1102 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1111 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1103 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1112 /*! 
LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1104 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1113 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1105 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1114 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1106 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1115 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1107 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1116 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1108 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1117 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1109 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1118 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1110 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1119 /*! connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1111 +#define WT_STAT_CONN_MEMORY_FREE 1120 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1112 +#define WT_STAT_CONN_MEMORY_GROW 1121 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1113 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1122 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1114 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1123 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1115 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1124 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1116 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1125 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1117 +#define WT_STAT_CONN_PAGE_SLEEP 1126 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1118 +#define WT_STAT_CONN_READ_IO 1127 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1119 +#define WT_STAT_CONN_REC_PAGES 1128 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1120 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1129 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1121 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1130 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1122 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1131 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1123 +#define WT_STAT_CONN_RWLOCK_READ 1132 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1124 +#define WT_STAT_CONN_RWLOCK_WRITE 1133 /*! session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1125 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1134 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1126 +#define WT_STAT_CONN_SESSION_OPEN 1135 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1127 +#define WT_STAT_CONN_TXN_BEGIN 1136 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1128 +#define WT_STAT_CONN_TXN_CHECKPOINT 1137 /*! 
transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1129 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1138 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1130 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1139 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1131 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1140 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1132 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1141 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1133 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1142 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1134 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1143 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1135 +#define WT_STAT_CONN_TXN_COMMIT 1144 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1136 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1145 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1137 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1146 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1138 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1147 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1139 +#define WT_STAT_CONN_TXN_ROLLBACK 1148 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1140 +#define WT_STAT_CONN_TXN_SYNC 1149 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1141 +#define WT_STAT_CONN_WRITE_IO 1150 /*! * @} @@ -3875,146 +3900,158 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019 /*! btree: column-store internal pages */ #define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020 +/*! btree: column-store variable-size RLE encoded values */ +#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021 /*! btree: column-store variable-size leaf pages */ -#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2021 +#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022 /*! btree: pages rewritten by compaction */ -#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2022 +#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023 /*! btree: number of key/value pairs */ -#define WT_STAT_DSRC_BTREE_ENTRIES 2023 +#define WT_STAT_DSRC_BTREE_ENTRIES 2024 /*! btree: fixed-record size */ -#define WT_STAT_DSRC_BTREE_FIXED_LEN 2024 +#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025 /*! btree: maximum tree depth */ -#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2025 +#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026 /*! btree: maximum internal page key size */ -#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2026 +#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027 /*! btree: maximum internal page size */ -#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2027 +#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028 /*! btree: maximum leaf page key size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2028 +#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029 /*! btree: maximum leaf page size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2029 +#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030 /*! 
btree: maximum leaf page value size */ -#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2030 +#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031 /*! btree: overflow pages */ -#define WT_STAT_DSRC_BTREE_OVERFLOW 2031 +#define WT_STAT_DSRC_BTREE_OVERFLOW 2032 /*! btree: row-store internal pages */ -#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2032 +#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033 /*! btree: row-store leaf pages */ -#define WT_STAT_DSRC_BTREE_ROW_LEAF 2033 +#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034 /*! cache: bytes read into cache */ -#define WT_STAT_DSRC_CACHE_BYTES_READ 2034 +#define WT_STAT_DSRC_CACHE_BYTES_READ 2035 /*! cache: bytes written from cache */ -#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2035 +#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036 /*! cache: checkpoint blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2036 +#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037 /*! cache: unmodified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2037 +#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038 /*! cache: page split during eviction deepened the tree */ -#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2038 +#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039 /*! cache: modified pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2039 +#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040 /*! cache: data source pages selected for eviction unable to be evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2040 +#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041 /*! cache: hazard pointer blocked page eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2041 +#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042 /*! cache: internal pages evicted */ -#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2042 +#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043 /*! cache: pages split during eviction */ -#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2043 +#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044 /*! cache: in-memory page splits */ -#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2044 +#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045 +/*! cache: in-memory page passed criteria to be split */ +#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046 /*! cache: overflow values cached in memory */ -#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045 +#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047 /*! cache: pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ 2046 +#define WT_STAT_DSRC_CACHE_READ 2048 +/*! cache: pages read into cache requiring lookaside entries */ +#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049 /*! cache: overflow pages read into cache */ -#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047 +#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050 /*! cache: pages written from cache */ -#define WT_STAT_DSRC_CACHE_WRITE 2048 +#define WT_STAT_DSRC_CACHE_WRITE 2051 +/*! cache: page written requiring lookaside records */ +#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052 +/*! cache: pages written requiring in-memory restoration */ +#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053 /*! compression: raw compression call failed, no additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054 /*! compression: raw compression call failed, additional data available */ -#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050 +#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055 /*! compression: raw compression call succeeded */ -#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051 +#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056 /*! 
compression: compressed pages read */ -#define WT_STAT_DSRC_COMPRESS_READ 2052 +#define WT_STAT_DSRC_COMPRESS_READ 2057 /*! compression: compressed pages written */ -#define WT_STAT_DSRC_COMPRESS_WRITE 2053 +#define WT_STAT_DSRC_COMPRESS_WRITE 2058 /*! compression: page written failed to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054 +#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059 /*! compression: page written was too small to compress */ -#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055 +#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060 /*! cursor: create calls */ -#define WT_STAT_DSRC_CURSOR_CREATE 2056 +#define WT_STAT_DSRC_CURSOR_CREATE 2061 /*! cursor: insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT 2057 +#define WT_STAT_DSRC_CURSOR_INSERT 2062 /*! cursor: bulk-loaded cursor-insert calls */ -#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058 +#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063 /*! cursor: cursor-insert key and value bytes inserted */ -#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059 +#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064 /*! cursor: next calls */ -#define WT_STAT_DSRC_CURSOR_NEXT 2060 +#define WT_STAT_DSRC_CURSOR_NEXT 2065 /*! cursor: prev calls */ -#define WT_STAT_DSRC_CURSOR_PREV 2061 +#define WT_STAT_DSRC_CURSOR_PREV 2066 /*! cursor: remove calls */ -#define WT_STAT_DSRC_CURSOR_REMOVE 2062 +#define WT_STAT_DSRC_CURSOR_REMOVE 2067 /*! cursor: cursor-remove key bytes removed */ -#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063 +#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068 /*! cursor: reset calls */ -#define WT_STAT_DSRC_CURSOR_RESET 2064 +#define WT_STAT_DSRC_CURSOR_RESET 2069 +/*! cursor: restarted searches */ +#define WT_STAT_DSRC_CURSOR_RESTART 2070 /*! cursor: search calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH 2065 +#define WT_STAT_DSRC_CURSOR_SEARCH 2071 /*! cursor: search near calls */ -#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2066 +#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072 /*! cursor: update calls */ -#define WT_STAT_DSRC_CURSOR_UPDATE 2067 +#define WT_STAT_DSRC_CURSOR_UPDATE 2073 /*! cursor: cursor-update value bytes updated */ -#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2068 +#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2069 +#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075 /*! LSM: chunks in the LSM tree */ -#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2070 +#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076 /*! LSM: highest merge generation in the LSM tree */ -#define WT_STAT_DSRC_LSM_GENERATION_MAX 2071 +#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077 /*! LSM: queries that could have benefited from a Bloom filter that did * not exist */ -#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2072 +#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2073 +#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079 /*! reconciliation: dictionary matches */ -#define WT_STAT_DSRC_REC_DICTIONARY 2074 +#define WT_STAT_DSRC_REC_DICTIONARY 2080 /*! reconciliation: internal page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2075 +#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081 /*! reconciliation: leaf page multi-block writes */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2076 +#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082 /*! reconciliation: maximum blocks required for a page */ -#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2077 +#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083 /*! 
reconciliation: internal-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2078 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084 /*! reconciliation: leaf-page overflow keys */ -#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2079 +#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085 /*! reconciliation: overflow values written */ -#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2080 +#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086 /*! reconciliation: pages deleted */ -#define WT_STAT_DSRC_REC_PAGE_DELETE 2081 +#define WT_STAT_DSRC_REC_PAGE_DELETE 2087 /*! reconciliation: page checksum matches */ -#define WT_STAT_DSRC_REC_PAGE_MATCH 2082 +#define WT_STAT_DSRC_REC_PAGE_MATCH 2088 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_DSRC_REC_PAGES 2083 +#define WT_STAT_DSRC_REC_PAGES 2089 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_DSRC_REC_PAGES_EVICTION 2084 +#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090 /*! reconciliation: leaf page key bytes discarded using prefix compression */ -#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2085 +#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091 /*! reconciliation: internal page key bytes discarded using suffix * compression */ -#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2086 +#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092 /*! session: object compaction */ -#define WT_STAT_DSRC_SESSION_COMPACT 2087 +#define WT_STAT_DSRC_SESSION_COMPACT 2093 /*! session: open cursor count */ -#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2088 +#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094 /*! transaction: update conflicts */ -#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2089 +#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095 /*! @} */ /* * Statistics section: END diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 64e29e104bc..4d46a25b63c 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -41,6 +41,7 @@ extern "C" { #else #include <pthread.h> #endif +#include <stdbool.h> #include <stddef.h> #include <stdio.h> #include <stdint.h> @@ -55,11 +56,6 @@ extern "C" { #include <windows.h> #endif -/******************************************* - * WiredTiger externally maintained include files. - *******************************************/ -#include "queue.h" - /* * DO NOT EDIT: automatically built by dist/s_typedef. 
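Both statistics lists above are renumbered, so applications that look up statistics by key must be rebuilt against the updated wiredtiger.h; the key values are compile-time constants and are not stable across releases. As a reference point, here is a minimal sketch of reading one of the newly added keys through a statistics cursor. It assumes the connection was opened with statistics collection enabled (for example, "statistics=(fast)"); error handling is abbreviated.

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

/* Print the new cursor-restart statistic; assumes an open session. */
static int
print_cursor_restarts(WT_SESSION *session)
{
	WT_CURSOR *c;
	const char *desc, *pvalue;
	int64_t value;
	int ret;

	if ((ret = session->open_cursor(
	    session, "statistics:", NULL, NULL, &c)) != 0)
		return (ret);
	c->set_key(c, WT_STAT_CONN_CURSOR_RESTART);
	if ((ret = c->search(c)) == 0 &&
	    (ret = c->get_value(c, &desc, &pvalue, &value)) == 0)
		printf("%s: %" PRId64 "\n", desc, value);
	(void)c->close(c);
	return (ret);
}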
* Forward type declarations for internal types: BEGIN @@ -182,12 +178,18 @@ struct __wt_insert_head; typedef struct __wt_insert_head WT_INSERT_HEAD; struct __wt_keyed_encryptor; typedef struct __wt_keyed_encryptor WT_KEYED_ENCRYPTOR; +struct __wt_log; + typedef struct __wt_log WT_LOG; struct __wt_log_desc; typedef struct __wt_log_desc WT_LOG_DESC; struct __wt_log_op_desc; typedef struct __wt_log_op_desc WT_LOG_OP_DESC; struct __wt_log_rec_desc; typedef struct __wt_log_rec_desc WT_LOG_REC_DESC; +struct __wt_log_record; + typedef struct __wt_log_record WT_LOG_RECORD; +struct __wt_logslot; + typedef struct __wt_logslot WT_LOGSLOT; struct __wt_lsm_chunk; typedef struct __wt_lsm_chunk WT_LSM_CHUNK; struct __wt_lsm_data_source; @@ -204,6 +206,8 @@ struct __wt_lsm_worker_cookie; typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE; struct __wt_multi; typedef struct __wt_multi WT_MULTI; +struct __wt_myslot; + typedef struct __wt_myslot WT_MYSLOT; struct __wt_named_collator; typedef struct __wt_named_collator WT_NAMED_COLLATOR; struct __wt_named_compressor; @@ -242,16 +246,18 @@ struct __wt_rwlock; typedef struct __wt_rwlock WT_RWLOCK; struct __wt_salvage_cookie; typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE; +struct __wt_save_upd; + typedef struct __wt_save_upd WT_SAVE_UPD; struct __wt_scratch_track; typedef struct __wt_scratch_track WT_SCRATCH_TRACK; struct __wt_session_impl; typedef struct __wt_session_impl WT_SESSION_IMPL; struct __wt_size; typedef struct __wt_size WT_SIZE; +struct __wt_spinlock; + typedef struct __wt_spinlock WT_SPINLOCK; struct __wt_split_stash; typedef struct __wt_split_stash WT_SPLIT_STASH; -struct __wt_stats; - typedef struct __wt_stats WT_STATS; struct __wt_table; typedef struct __wt_table WT_TABLE; struct __wt_txn; @@ -262,8 +268,6 @@ struct __wt_txn_op; typedef struct __wt_txn_op WT_TXN_OP; struct __wt_txn_state; typedef struct __wt_txn_state WT_TXN_STATE; -struct __wt_upd_skipped; - typedef struct __wt_upd_skipped WT_UPD_SKIPPED; struct __wt_update; typedef struct __wt_update WT_UPDATE; union __wt_rand_state; @@ -285,6 +289,8 @@ union __wt_rand_state; #endif #include "hardware.h" +#include "queue.h" + #ifdef _WIN32 #include "os_windows.h" #else @@ -330,6 +336,7 @@ union __wt_rand_state; #include "cache.i" /* required by txn.i */ #include "cell.i" /* required by btree.i */ +#include "log.i" #include "mutex.i" /* required by btree.i */ #include "txn.i" /* required by btree.i */ diff --git a/src/log/log.c b/src/log/log.c index 4242571fe53..4041761d062 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -34,6 +34,24 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) } /* + * __wt_log_ckpt_lsn -- + * Force out buffered records and return an LSN for checkpoint. + */ +int +__wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + WT_RET(__wt_log_force_write(session, 1)); + WT_RET(__wt_log_wrlsn(session)); + *ckp_lsn = log->write_start_lsn; + return (0); +} + +/* * __wt_log_background -- * Record the given LSN as the background LSN and signal the * thread as needed. @@ -53,7 +71,7 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn) * needed. 
*/ __wt_spin_lock(session, &log->log_sync_lock); - if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0) + if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0) log->bg_sync_lsn = *lsn; __wt_spin_unlock(session, &log->log_sync_lock); return (__wt_cond_signal(session, conn->log_file_cond)); @@ -100,7 +118,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) /* * Sync the log file if needed. */ - if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) { + if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync to LSN %d/%lu", min_lsn->file, min_lsn->offset)); @@ -241,6 +259,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session, log = S2C(session)->log; *maxid = 0; + /* + * These may be files needed by backup. Force the current slot + * to get written to the file. + */ + WT_RET(__wt_log_force_write(session, 1)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); /* Filter out any files that are below the checkpoint LSN. */ @@ -354,70 +377,12 @@ static int __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) { WT_CONNECTION_IMPL *conn; - - conn = S2C(session); - return (lsn->offset + (wt_off_t)recsize < conn->log_file_max); -} - -/* - * __log_acquire -- - * Called with the log slot lock held. Can be called recursively - * from __wt_log_newfile when we change log files. - */ -static int -__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) -{ - WT_CONNECTION_IMPL *conn; WT_LOG *log; - int created_log; conn = S2C(session); log = conn->log; - created_log = 1; - /* - * Called locked. Add recsize to alloc_lsn. Save our starting LSN - * where the previous allocation finished for the release LSN. - * That way when log files switch, we're waiting for the correct LSN - * from outstanding writes. - */ - slot->slot_release_lsn = log->alloc_lsn; - if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { - WT_RET(__wt_log_newfile(session, 0, &created_log)); - if (log->log_close_fh != NULL) - F_SET(slot, WT_SLOT_CLOSEFH); - } - - /* - * Checkpoints can be configured based on amount of log written. - * Add in this log record to the sum and if needed, signal the - * checkpoint condition. The logging subsystem manages the - * accumulated field. There is a bit of layering violation - * here checking the connection ckpt field and using its - * condition. - */ - if (WT_CKPT_LOGSIZE(conn)) { - log->log_written += (wt_off_t)recsize; - WT_RET(__wt_checkpoint_signal(session, log->log_written)); - } - - /* - * Need to minimally fill in slot info here. Our slot start LSN - * comes after any potential new log file creations. - */ - slot->slot_start_lsn = log->alloc_lsn; - slot->slot_start_offset = log->alloc_lsn.offset; - /* - * Pre-allocate on the first real write into the log file, if it - * was just created (i.e. not pre-allocated). 
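The hunks above and below replace the WT_LOG_CMP macro with the __wt_log_cmp function. The comparison itself is plain file-then-offset ordering; the following self-contained sketch shows the idea, with lsn_t standing in for the real WT_LSN type, which it simplifies.

#include <stdint.h>

/* Stand-in for WT_LSN: a log file number plus a byte offset. */
typedef struct {
	uint32_t file;
	uint64_t offset;
} lsn_t;

/* Order LSNs by file first, then offset; returns <0, 0 or >0. */
static inline int
lsn_cmp(const lsn_t *a, const lsn_t *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}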
- */ - if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) - WT_RET(__log_prealloc(session, log->log_fh)); - - log->alloc_lsn.offset += (wt_off_t)recsize; - slot->slot_end_lsn = log->alloc_lsn; - slot->slot_error = 0; - slot->slot_fh = log->log_fh; - return (0); + return (lsn->offset == WT_LOG_FIRST_RECORD || + lsn->offset + (wt_off_t)recsize < conn->log_file_max); } /* @@ -490,24 +455,32 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) */ static int __log_fill(WT_SESSION_IMPL *session, - WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp) + WT_MYSLOT *myslot, int force, WT_ITEM *record, WT_LSN *lsnp) { WT_DECL_RET; WT_LOG_RECORD *logrec; + /* + * The WT_LOG_SLOT_BUF_MAX macro uses log. + */ logrec = (WT_LOG_RECORD *)record->mem; /* - * Call __wt_write. For now the offset is the real byte offset. If the - * offset becomes a unit of WT_LOG_ALIGN this is where we would multiply - * by WT_LOG_ALIGN to get the real file byte offset for write(). + * Call __wt_write or copy into the buffer. For now the offset is the + * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this + * is where we would multiply by WT_LOG_ALIGN to get the real file byte + * offset for write(). */ - if (direct) + if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) + memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, + logrec, logrec->len); + else + /* + * If this is a force or unbuffered write, write it now. + * A forced write sends in a temporary, local slot. + */ WT_ERR(__wt_write(session, myslot->slot->slot_fh, myslot->offset + myslot->slot->slot_start_offset, (size_t)logrec->len, (void *)logrec)); - else - memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, - logrec, logrec->len); WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len); if (lsnp != NULL) { @@ -563,13 +536,13 @@ __log_file_header( logrec->checksum = 0; logrec->checksum = __wt_cksum(logrec, log->allocsize); WT_CLEAR(tmp); + memset(&myslot, 0, sizeof(myslot)); myslot.slot = &tmp; - myslot.offset = 0; /* - * We may recursively call __log_acquire to allocate log space for the - * log descriptor record. Call __log_fill to write it, but we - * do not need to call __log_release because we're not waiting for + * We may recursively call __wt_log_acquire to allocate log space for + * the log descriptor record. Call __log_fill to write it, but we + * do not need to call __wt_log_release because we're not waiting for * any earlier operations to complete. */ if (prealloc) { @@ -577,7 +550,7 @@ __log_file_header( tmp.slot_fh = fh; } else { WT_ASSERT(session, fh == NULL); - WT_ERR(__log_acquire(session, logrec->len, &tmp)); + WT_ERR(__wt_log_acquire(session, logrec->len, &tmp)); } WT_ERR(__log_fill(session, &myslot, 1, buf, NULL)); /* @@ -697,6 +670,146 @@ err: __wt_scr_free(session, &from_path); } /* + * __log_newfile -- + * Create the next log file and write the file header record into it. + */ +static int +__log_newfile(WT_SESSION_IMPL *session, int conn_open, int *created) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LSN end_lsn; + int create_log, yield_cnt; + + conn = S2C(session); + log = conn->log; + + create_log = 1; + yield_cnt = 0; + /* + * Set aside the log file handle to be closed later. Other threads + * may still be using it to write to the log. If the log file size + * is small we could fill a log file before the previous one is closed. + * Wait for that to close. 
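The pre-allocation decision inside the new __log_newfile (continued in the next hunk) reduces to a three-way split on the return of __log_alloc_prealloc: 0 recycles an existing pre-allocated file, WT_NOTFOUND falls through to creating a fresh one, and anything else is a hard error. A hedged restatement of that decision follows; the helper functions are hypothetical stubs, not the real WiredTiger calls.

#include <stdint.h>

#define EX_NOTFOUND	(-31803)	/* stand-in for WT_NOTFOUND */

/* Hypothetical stubs for the real allocation calls. */
static int recycle_prealloc(uint32_t id) { (void)id; return (EX_NOTFOUND); }
static int create_log_file(uint32_t id) { (void)id; return (0); }

static int
next_log_file(uint32_t fileid)
{
	int ret;

	if ((ret = recycle_prealloc(fileid)) == 0)
		return (0);		/* reused a pre-allocated file */
	if (ret != EX_NOTFOUND)
		return (ret);		/* any other error is fatal */
	return (create_log_file(fileid));	/* none free: create one */
}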
+ */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + while (log->log_close_fh != NULL) { + WT_STAT_FAST_CONN_INCR(session, log_close_yields); + WT_RET(__wt_log_wrlsn(session)); + if (++yield_cnt > 10000) + return (EBUSY); + __wt_yield(); + } + log->log_close_fh = log->log_fh; + if (log->log_close_fh != NULL) + log->log_close_lsn = log->alloc_lsn; + log->fileid++; + /* + * Make sure everything we set above is visible. + */ + WT_FULL_BARRIER(); + /* + * If we're pre-allocating log files, look for one. If there aren't any + * or we're not pre-allocating, then create one. + */ + if (conn->log_prealloc) { + ret = __log_alloc_prealloc(session, log->fileid); + /* + * If ret is 0 it means we found a pre-allocated file. + * If ret is non-zero but not WT_NOTFOUND, we return the error. + * If ret is WT_NOTFOUND, we leave create_log set and create + * the new log file. + */ + if (ret == 0) + create_log = 0; + /* + * If we get any error other than WT_NOTFOUND, return it. + */ + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + ret = 0; + } + /* + * If we need to create the log file, do so now. + */ + if (create_log) { + log->prep_missed++; + WT_RET(__wt_log_allocfile( + session, log->fileid, WT_LOG_FILENAME, 1)); + } + WT_RET(__log_openfile(session, + 0, &log->log_fh, WT_LOG_FILENAME, log->fileid)); + /* + * We need to setup the LSNs. Set the end LSN and alloc LSN to + * the end of the header. + */ + log->alloc_lsn.file = log->fileid; + log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; + end_lsn = log->alloc_lsn; + + /* + * If we're called from connection creation code, we need to update + * the LSNs since we're the only write in progress. + */ + if (conn_open) { + WT_RET(__wt_fsync(session, log->log_fh)); + log->sync_lsn = end_lsn; + log->write_lsn = end_lsn; + log->write_start_lsn = end_lsn; + } + if (created != NULL) + *created = create_log; + return (0); +} + +/* + * __wt_log_acquire -- + * Called serially when switching slots. Can be called recursively + * from __log_newfile when we change log files. + */ +int +__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int created_log; + + conn = S2C(session); + log = conn->log; + created_log = 1; + /* + * Add recsize to alloc_lsn. Save our starting LSN + * where the previous allocation finished for the release LSN. + * That way when log files switch, we're waiting for the correct LSN + * from outstanding writes. + */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + /* + * We need to set the release LSN earlier, before a log file change. + */ + slot->slot_release_lsn = log->alloc_lsn; + if (!__log_size_fit(session, &log->alloc_lsn, recsize)) { + WT_RET(__log_newfile(session, 0, &created_log)); + if (log->log_close_fh != NULL) + F_SET(slot, WT_SLOT_CLOSEFH); + } + + /* + * Pre-allocate on the first real write into the log file, if it + * was just created (i.e. not pre-allocated). + */ + if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) + WT_RET(__log_prealloc(session, log->log_fh)); + /* + * Initialize the slot for activation. + */ + __wt_log_slot_activate(session, slot); + + return (0); +} + +/* * __log_truncate -- * Truncate the log to the given LSN. If this_log is set, it will only * truncate the log file indicated in the given LSN. 
If not set, @@ -791,7 +904,7 @@ __wt_log_allocfile( */ WT_RET(__wt_scr_alloc(session, 0, &from_path)); WT_ERR(__wt_scr_alloc(session, 0, &to_path)); - tmp_id = WT_ATOMIC_ADD4(log->tmp_fileid, 1); + tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1); WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path)); WT_ERR(__log_filename(session, lognum, dest, to_path)); /* @@ -842,7 +955,7 @@ err: __wt_scr_free(session, &path); * __wt_log_open -- * Open the appropriate log file for the connection. The purpose is * to find the last log file that exists, open it and set our initial - * LSNs to the end of that file. If none exist, call __wt_log_newfile + * LSNs to the end of that file. If none exist, call __log_newfile * to create it. */ int @@ -917,7 +1030,9 @@ __wt_log_open(WT_SESSION_IMPL *session) * Start logging at the beginning of the next log file, no matter * where the previous log file ends. */ - WT_ERR(__wt_log_newfile(session, 1, NULL)); + WT_WITH_SLOT_LOCK(session, log, + ret = __log_newfile(session, 1, NULL)); + WT_ERR(ret); /* If we found log files, save the new state. */ if (logcount > 0) { @@ -1055,48 +1170,67 @@ err: } /* - * __log_release -- + * __wt_log_release -- * Release a log slot. */ -static int -__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) +int +__wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LSN sync_lsn; - size_t write_size; - int locked, yield_count; + int locked, need_relock, yield_count; + int64_t release_buffered, release_bytes; conn = S2C(session); log = conn->log; - locked = yield_count = 0; - *freep = 1; + locked = need_relock = yield_count = 0; + if (freep != NULL) + *freep = 1; + release_buffered = + WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); + release_bytes = release_buffered + slot->slot_unbuffered; /* Write the buffered records */ - if (F_ISSET(slot, WT_SLOT_BUFFERED)) { - write_size = (size_t) - (slot->slot_end_lsn.offset - slot->slot_start_offset); - WT_ERR(__wt_write(session, slot->slot_fh, - slot->slot_start_offset, write_size, slot->slot_buf.mem)); + /* + * Checkpoints can be configured based on amount of log written. + * Add in this log record to the sum and if needed, signal the + * checkpoint condition. The logging subsystem manages the + * accumulated field. There is a bit of layering violation + * here checking the connection ckpt field and using its + * condition. + */ + if (WT_CKPT_LOGSIZE(conn)) { + log->log_written += (wt_off_t)release_bytes; + WT_RET(__wt_checkpoint_signal(session, log->log_written)); } + if (release_buffered != 0) + WT_ERR(__wt_write(session, + slot->slot_fh, slot->slot_start_offset, + (size_t)release_buffered, slot->slot_buf.mem)); + /* - * If this is not a buffered write, meaning the slot we have is a - * dummy constructed slot, not from the slot pool, or we have to wait - * for a synchronous operation, we do not pass handling of this slot - * off to the worker thread. The caller is responsible for freeing - * the slot in that case. Otherwise the worker thread will free it. + * If we have to wait for a synchronous operation, we do not pass + * handling of this slot off to the worker thread. The caller is + * responsible for freeing the slot in that case. Otherwise the + * worker thread will free it. 
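__wt_log_release now orders writes by waiting until the global write LSN reaches this slot's release LSN, dropping the slot lock while waiting so an earlier, stalled writer can finish its switch. The yield-then-block backoff it uses is a common pattern; a simplified, self-contained sketch follows, with sched_yield and usleep standing in for __wt_yield and the timed condition wait.

#define _DEFAULT_SOURCE
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

/* Spin briefly, then back off to sleeping, until it is our turn. */
static void
wait_for_turn(_Atomic uint64_t *write_lsn, uint64_t my_release_lsn)
{
	int yields = 0;

	while (atomic_load(write_lsn) != my_release_lsn) {
		if (++yields < 1000)
			sched_yield();	/* cheap: give up the CPU */
		else
			usleep(200);	/* stand-in for a condition wait */
	}
}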
*/ - if (F_ISSET(slot, WT_SLOT_BUFFERED) && - !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { - *freep = 0; + if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + if (freep != NULL) + *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; /* * After this point the worker thread owns the slot. There * is nothing more to do but return. */ - WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond)); + /* + * !!! Signalling the wrlsn_cond condition here results in + * worse performance because it causes more scheduling churn + * and more walking of the slot pool for a very small number + * of slots to process. Don't signal here. + */ goto done; } @@ -1105,15 +1239,31 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * be holes in the log file. */ WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn); - while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) { + while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { + /* + * If we're on a locked path and the write LSN is not advancing, + * unlock in case an earlier thread is trying to switch its + * slot and complete its operation. + */ + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) { + __wt_spin_unlock(session, &log->log_slot_lock); + need_relock = 1; + } if (++yield_count < 1000) __wt_yield(); else WT_ERR(__wt_cond_wait( session, log->log_write_cond, 200)); + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) { + __wt_spin_lock(session, &log->log_slot_lock); + need_relock = 0; + } } + log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; + + WT_ASSERT(session, slot != log->active_slot); WT_ERR(__wt_cond_signal(session, log->log_write_cond)); /* @@ -1168,7 +1318,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) * Sync the log file if needed. */ if (F_ISSET(slot, WT_SLOT_SYNC) && - WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) { + __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s", log->log_fh->name)); WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -1186,6 +1336,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep) } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); + if (need_relock) + __wt_spin_lock(session, &log->log_slot_lock); if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; done: @@ -1193,93 +1345,6 @@ done: } /* - * __wt_log_newfile -- - * Create the next log file and write the file header record into it. - */ -int -__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created) -{ - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LSN end_lsn; - int create_log; - - conn = S2C(session); - log = conn->log; - - create_log = 1; - /* - * Set aside the log file handle to be closed later. Other threads - * may still be using it to write to the log. If the log file size - * is small we could fill a log file before the previous one is closed. - * Wait for that to close. - */ - while (log->log_close_fh != NULL) { - WT_STAT_FAST_CONN_INCR(session, log_close_yields); - WT_RET(__wt_log_wrlsn(session, NULL, NULL)); - __wt_yield(); - } - log->log_close_fh = log->log_fh; - log->fileid++; - - /* - * If we're pre-allocating log files, look for one. If there aren't any - * or we're not pre-allocating, then create one. - */ - ret = 0; - if (conn->log_prealloc) { - ret = __log_alloc_prealloc(session, log->fileid); - /* - * If ret is 0 it means we found a pre-allocated file. 
- * If ret is non-zero but not WT_NOTFOUND, we return the error. - * If ret is WT_NOTFOUND, we leave create_log set and create - * the new log file. - */ - if (ret == 0) - create_log = 0; - /* - * If we get any error other than WT_NOTFOUND, return it. - */ - if (ret != 0 && ret != WT_NOTFOUND) - return (ret); - ret = 0; - } - /* - * If we need to create the log file, do so now. - */ - if (create_log) { - log->prep_missed++; - if ((ret = __wt_log_allocfile( - session, log->fileid, WT_LOG_FILENAME, 0)) != 0) - return (ret); - } - WT_RET(__log_openfile(session, - 0, &log->log_fh, WT_LOG_FILENAME, log->fileid)); - /* - * We need to setup the LSNs. Set the end LSN and alloc LSN to - * the end of the header. - */ - log->alloc_lsn.file = log->fileid; - log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; - end_lsn = log->alloc_lsn; - - /* - * If we're called from connection creation code, we need to update - * the LSNs since we're the only write in progress. - */ - if (conn_create) { - WT_RET(__wt_fsync(session, log->log_fh)); - log->sync_lsn = end_lsn; - log->write_lsn = end_lsn; - log->write_start_lsn = end_lsn; - } - if (created != NULL) - *created = create_log; - return (0); -} - -/* * __wt_log_scan -- * Scan the logs, calling a function on each record found. */ @@ -1535,7 +1600,7 @@ advance: /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && - WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0) + __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) WT_ERR(__log_truncate(session, &rd_lsn, WT_LOG_FILENAME, 0)); @@ -1559,43 +1624,20 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans); } /* - * __log_direct_write -- - * Write a log record without using the consolidation arrays. + * __wt_log_force_write -- + * Force a switch and release and write of the current slot. + * Wrapper function that takes the lock. */ -static int -__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, - uint32_t flags) +int +__wt_log_force_write(WT_SESSION_IMPL *session, int retry) { - WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT tmp; WT_MYSLOT myslot; - int dummy, locked; log = S2C(session)->log; - myslot.slot = &tmp; - myslot.offset = 0; - dummy = 0; - WT_CLEAR(tmp); - - /* Fast path the contended case. */ - if (__wt_spin_trylock(session, &log->log_slot_lock) != 0) - return (EAGAIN); - locked = 1; - - if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(&tmp, WT_SLOT_SYNC_DIR); - if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(&tmp, WT_SLOT_SYNC); - WT_ERR(__log_acquire(session, record->size, &tmp)); - __wt_spin_unlock(session, &log->log_slot_lock); - locked = 0; - WT_ERR(__log_fill(session, &myslot, 1, record, lsnp)); - WT_ERR(__log_release(session, &tmp, &dummy)); - -err: if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); - return (ret); + memset(&myslot, 0, sizeof(myslot)); + myslot.slot = log->active_slot; + return (__wt_log_slot_switch(session, &myslot, retry, 1)); } /* @@ -1741,14 +1783,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LOG_RECORD *logrec; WT_LSN lsn; WT_MYSLOT myslot; - uint32_t rdup_len; - int free_slot, locked; + int64_t release_size; + uint32_t force, rdup_len; + int free_slot; conn = S2C(session); log = conn->log; - free_slot = locked = 0; + free_slot = 0; WT_INIT_LSN(&lsn); myslot.slot = NULL; + memset(&myslot, 0, sizeof(myslot)); /* * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a * header at the beginning for us to fill in. 
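Before joining a slot, __log_write_internal rounds the record length up to the log's allocation unit (the rdup_len used below). Assuming the allocation size is a power of two, as WiredTiger's allocsize is, the rounding is a one-liner:

#include <stdint.h>

/* Round len up to a multiple of allocsize (a power of two). */
static inline uint32_t
round_up(uint32_t len, uint32_t allocsize)
{
	return ((len + allocsize - 1) & ~(allocsize - 1));
}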
@@ -1778,87 +1822,67 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_STAT_FAST_CONN_INCR(session, log_writes); - if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) { - ret = __log_direct_write(session, record, &lsn, flags); - if (ret == 0 && lsnp != NULL) - *lsnp = lsn; - /* - * All needed syncing will be handled directly except - * a background sync. Handle that here. - */ - if (ret == 0) { - if (LF_ISSET(WT_LOG_BACKGROUND)) - goto bg; - else - return (0); - } - if (ret != EAGAIN) - WT_ERR(ret); - /* - * An EAGAIN return means we failed to get the try lock - - * fall through to the consolidation code in that case. - */ - } - + __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * If the addition of this record crosses the buffer boundary, + * switch in a new slot. + */ + force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC); + ret = 0; + if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || + F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) + ret = __wt_log_slot_switch(session, &myslot, 1, 0); + if (ret == 0) + ret = __log_fill(session, &myslot, 0, record, &lsn); + release_size = __wt_log_slot_release( + session, &myslot, (int64_t)rdup_len); /* - * As soon as we see contention for the log slot, disable direct - * log writes. We get better performance by forcing writes through - * the consolidation code. This is because individual writes flood - * the I/O system faster than they contend on the log slot lock. + * If we get an error we still need to do proper accounting in + * the slot fields. + * XXX On error we may still need to call release and free. */ - F_SET(log, WT_LOG_FORCE_CONSOLIDATE); - if ((ret = __wt_log_slot_join( - session, rdup_len, flags, &myslot)) == ENOMEM) { + if (ret != 0) + myslot.slot->slot_error = ret; + WT_ASSERT(session, ret == 0); + if (WT_LOG_SLOT_DONE(release_size)) { + WT_ERR(__wt_log_release(session, myslot.slot, &free_slot)); + if (free_slot) + __wt_log_slot_free(session, myslot.slot); + } else if (force) { /* - * If we couldn't find a consolidated slot for this record - * write the record directly. + * If we are going to wait for this slot to get written, + * signal the wrlsn thread. + * + * XXX I've seen times when conditions are NULL. 
*/ - while ((ret = __log_direct_write( - session, record, lsnp, flags)) == EAGAIN) - ; - WT_ERR(ret); - return (0); + if (conn->log_cond != NULL) { + WT_ERR(__wt_cond_signal(session, conn->log_cond)); + __wt_yield(); + } else + WT_ERR(__wt_log_force_write(session, 1)); } - WT_ERR(ret); - if (myslot.offset == 0) { - __wt_spin_lock(session, &log->log_slot_lock); - locked = 1; - WT_ERR(__wt_log_slot_close(session, myslot.slot)); - WT_ERR(__log_acquire( - session, myslot.slot->slot_group_size, myslot.slot)); - __wt_spin_unlock(session, &log->log_slot_lock); - locked = 0; - WT_ERR(__wt_log_slot_notify(session, myslot.slot)); - } else - WT_ERR(__wt_log_slot_wait(session, myslot.slot)); - WT_ERR(__log_fill(session, &myslot, 0, record, &lsn)); - if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) { - WT_ERR(__log_release(session, myslot.slot, &free_slot)); - if (free_slot) - WT_ERR(__wt_log_slot_free(session, myslot.slot)); + if (LF_ISSET(WT_LOG_FLUSH)) { + /* Wait for our writes to reach the OS */ + while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && + myslot.slot->slot_error == 0) + (void)__wt_cond_wait( + session, log->log_write_cond, 10000); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ - while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 && + while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) (void)__wt_cond_wait( session, log->log_sync_cond, 10000); - } else if (LF_ISSET(WT_LOG_FLUSH)) { - /* Wait for our writes to reach the OS */ - while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - (void)__wt_cond_wait( - session, log->log_write_cond, 10000); } /* * Advance the background sync LSN if needed. */ -bg: if (LF_ISSET(WT_LOG_BACKGROUND) && - WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0) + if (LF_ISSET(WT_LOG_BACKGROUND) && + __wt_log_cmp(&session->bg_sync_lsn, &lsn) <= 0) WT_ERR(__wt_log_background(session, &lsn)); -err: if (locked) - __wt_spin_unlock(session, &log->log_slot_lock); +err: if (ret == 0 && lsnp != NULL) *lsnp = lsn; /* diff --git a/src/log/log_slot.c b/src/log/log_slot.c index 0b580af4526..216a594ce3d 100644 --- a/src/log/log_slot.c +++ b/src/log/log_slot.c @@ -9,325 +9,486 @@ #include "wt_internal.h" /* - * This file implements the consolidated array algorithm as described in - * the paper: - * Scalability of write-ahead logging on multicore and multisocket hardware - * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis - * and Anastasia Ailamaki. - * - * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can - * be found at: - * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf + * __wt_log_slot_activate -- + * Initialize a slot to become active. */ - -/* - * __wt_log_slot_init -- - * Initialize the slot array. - */ -int -__wt_log_slot_init(WT_SESSION_IMPL *session) +void +__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT *slot; - int32_t i; conn = S2C(session); log = conn->log; - for (i = 0; i < WT_SLOT_POOL; i++) { - log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; - log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX; - } - /* - * Set up the available slots from the pool the first time. - */ - for (i = 0; i < WT_SLOT_ACTIVE; i++) { - slot = &log->slot_pool[i]; - slot->slot_index = (uint32_t)i; - slot->slot_state = WT_LOG_SLOT_READY; - log->slot_array[i] = slot; - } - - /* - * Allocate memory for buffers now that the arrays are setup. 
Split - * this out to make error handling simpler. - * - * Cap the slot buffer to the log file size. - */ - log->slot_buf_size = - WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); - for (i = 0; i < WT_SLOT_POOL; i++) { - WT_ERR(__wt_buf_init(session, - &log->slot_pool[i].slot_buf, log->slot_buf_size)); - F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); - } - WT_STAT_FAST_CONN_INCRV(session, - log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); - if (0) { -err: while (--i >= 0) - __wt_buf_free(session, &log->slot_pool[i].slot_buf); - } - return (ret); + slot->slot_state = 0; + slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn; + slot->slot_start_offset = log->alloc_lsn.offset; + slot->slot_last_offset = log->alloc_lsn.offset; + slot->slot_fh = log->log_fh; + slot->slot_error = 0; + slot->slot_unbuffered = 0; } /* - * __wt_log_slot_destroy -- - * Clean up the slot array on shutdown. + * __wt_log_slot_close -- + * Close out the slot the caller is using. The slot may already be + * closed or freed by another thread. */ int -__wt_log_slot_destroy(WT_SESSION_IMPL *session) +__wt_log_slot_close( + WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced) { WT_CONNECTION_IMPL *conn; WT_LOG *log; - int i; + int64_t end_offset, new_state, old_state; + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); log = conn->log; - - for (i = 0; i < WT_SLOT_POOL; i++) - __wt_buf_free(session, &log->slot_pool[i].slot_buf); + if (releasep != NULL) + *releasep = 0; + if (slot == NULL) + return (WT_NOTFOUND); +retry: + old_state = slot->slot_state; + /* + * If this close is coming from a forced close and a thread is in + * the middle of using the slot, return EBUSY. The caller can + * decide if retrying is necessary or not. + */ + if (forced && WT_LOG_SLOT_INPROGRESS(old_state)) + return (EBUSY); + /* + * If someone else is switching out this slot we lost. Nothing to + * do but return. Return WT_NOTFOUND anytime the given slot was + * processed by another closing thread. Only return 0 when we + * actually closed the slot. + */ + if (WT_LOG_SLOT_CLOSED(old_state)) + return (WT_NOTFOUND); + /* + * If someone completely processed this slot, we're done. + */ + if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) + return (WT_NOTFOUND); + new_state = (old_state | WT_LOG_SLOT_CLOSE); + /* + * Close this slot. If we lose the race retry. + */ + if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state)) + goto retry; + /* + * We own the slot now. No one else can join. + * Set the end LSN. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_closes); + if (WT_LOG_SLOT_DONE(new_state) && releasep != NULL) + *releasep = 1; + slot->slot_end_lsn = slot->slot_start_lsn; + end_offset = + WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered; + slot->slot_end_lsn.offset += (wt_off_t)end_offset; + WT_STAT_FAST_CONN_INCRV(session, + log_slot_consolidated, end_offset); + /* + * XXX Would like to change so one piece of code advances the LSN. + */ + log->alloc_lsn = slot->slot_end_lsn; + WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file); return (0); } /* - * __wt_log_slot_join -- - * Join a consolidated logging slot. Callers should be prepared to deal - * with a ENOMEM return - which indicates no slots could accommodate - * the log record. + * __log_slot_switch_internal -- + * Switch out the current slot and set up a new one. 
*/ -int -__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, - uint32_t flags, WT_MYSLOT *myslotp) +static int +__log_slot_switch_internal( + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int forced) { - WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; - int64_t new_state, old_state; - uint32_t allocated_slot, slot_attempts; + int free_slot, release; - conn = S2C(session); - log = conn->log; - slot_attempts = 0; + log = S2C(session)->log; + release = 0; + slot = myslot->slot; + + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - if (mysize >= (uint64_t)log->slot_buf_size) { - WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); - return (ENOMEM); - } -find_slot: -#if WT_SLOT_ACTIVE == 1 - allocated_slot = 0; -#else - allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE; -#endif - /* - * Get the selected slot. Use a barrier to prevent the compiler from - * caching this read. - */ - WT_BARRIER(); - slot = log->slot_array[allocated_slot]; -join_slot: - /* - * Read the current slot state. Use a barrier to prevent the compiler - * from caching this read. - */ - WT_BARRIER(); - old_state = slot->slot_state; - /* - * WT_LOG_SLOT_READY and higher means the slot is available for - * joining. Any other state means it is in use and transitioning - * from the active array. - */ - if (old_state < WT_LOG_SLOT_READY) { - WT_STAT_FAST_CONN_INCR(session, log_slot_transitions); - goto find_slot; - } /* - * Add in our size to the state and then atomically swap that - * into place if it is still the same value. + * If someone else raced us to closing this specific slot, we're + * done here. */ - new_state = old_state + (int64_t)mysize; - if (new_state < old_state) { - /* Our size doesn't fit here. */ - WT_STAT_FAST_CONN_INCR(session, log_slot_toobig); - goto find_slot; - } + if (slot != log->active_slot) + return (0); + /* - * If the slot buffer isn't big enough to hold this update, try - * to find another slot. + * If close returns WT_NOTFOUND, it means that someone else is + * processing the slot change. However, we could have retried + * from a busy time creating a new slot. If so, we are that + * someone else and we need to try setting up a new slot again. */ - if (new_state > (int64_t)slot->slot_buf.memsize) { - if (++slot_attempts > 5) { - WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall); - return (ENOMEM); + if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) { + ret = __wt_log_slot_close( + session, slot, &release, forced); + if (ret == WT_NOTFOUND) + return (0); + WT_RET(ret); + if (release) { + WT_RET(__wt_log_release(session, slot, &free_slot)); + if (free_slot) + __wt_log_slot_free(session, slot); } - goto find_slot; } /* - * We lost a race to add our size into this slot. Check the state - * and try again. + * Set that we have closed this slot because we may call in here + * multiple times if we retry creating a new slot. */ - if (!WT_ATOMIC_CAS8(slot->slot_state, old_state, new_state)) { - WT_STAT_FAST_CONN_INCR(session, log_slot_races); - goto join_slot; - } - WT_ASSERT(session, myslotp != NULL); + F_SET(myslot, WT_MYSLOT_CLOSE); + WT_RET(__wt_log_slot_new(session)); + F_CLR(myslot, WT_MYSLOT_CLOSE); + return (0); +} + +/* + * __wt_log_slot_switch -- + * Switch out the current slot and set up a new one. + */ +int +__wt_log_slot_switch( + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced) +{ + WT_DECL_RET; + WT_LOG *log; + + log = S2C(session)->log; /* - * We joined this slot. Fill in our information to return to - * the caller. + * !!! 
Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the + * compiler does not like it combined directly with the while loop + * here. */ - WT_STAT_FAST_CONN_INCR(session, log_slot_joins); - if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) - F_SET(slot, WT_SLOT_SYNC_DIR); - if (LF_ISSET(WT_LOG_FSYNC)) - F_SET(slot, WT_SLOT_SYNC); - myslotp->slot = slot; - myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY; - return (0); + do { + WT_WITH_SLOT_LOCK(session, log, + ret = __log_slot_switch_internal( + session, myslot, forced)); + if (ret == EBUSY) { + WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy); + __wt_yield(); + } + } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY)); + return (ret); } /* - * __log_slot_find_free -- - * Find and return a free log slot. + * __wt_log_slot_new -- + * Find a free slot and switch it as the new active slot. + * Must be called holding the slot lock. */ -static int -__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot) +int +__wt_log_slot_new(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_LOG *log; - uint32_t pool_i; + WT_LOGSLOT *slot; + int32_t i; + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); conn = S2C(session); log = conn->log; - WT_ASSERT(session, slot != NULL); /* - * Encourage processing and moving the write LSN forward. - * That process has to walk the slots anyway, so do that - * work and let it give us the index of a free slot along - * the way. + * Although this function is single threaded, multiple threads could + * be trying to set a new active slot sequentially. If we find an + * active slot that is valid, return. */ - WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); - while (pool_i == WT_SLOT_POOL) { + if ((slot = log->active_slot) != NULL && + WT_LOG_SLOT_OPEN(slot->slot_state)) + return (0); + + /* + * Keep trying until we can find a free slot. + */ + for (;;) { + /* + * For now just restart at 0. We could use log->pool_index + * if that is inefficient. + */ + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (slot->slot_state == WT_LOG_SLOT_FREE) { + /* + * Make sure that the next buffer size can + * fit in the file. Proactively switch if + * it cannot. This reduces, but does not + * eliminate, log files that exceed the + * maximum file size. + * + * We want to minimize the risk of an + * error due to no space. + */ + WT_RET(__wt_log_acquire(session, + log->slot_buf_size, slot)); + /* + * We have a new, free slot to use. + * Set it as the active slot. + */ + WT_STAT_FAST_CONN_INCR(session, + log_slot_transitions); + log->active_slot = slot; + return (0); + } + } + /* + * If we didn't find any free slots signal the worker thread. + */ + (void)__wt_cond_signal(session, conn->log_wrlsn_cond); __wt_yield(); - WT_RET(__wt_log_wrlsn(session, &pool_i, NULL)); } - *slot = &log->slot_pool[pool_i]; - WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE); - return (0); + /* NOTREACHED */ } /* - * __wt_log_slot_close -- - * Close a slot and do not allow any other threads to join this slot. - * Remove this from the active slot array and move a new slot from - * the pool into its place. Set up the size of this group; - * Must be called with the logging spinlock held. + * __wt_log_slot_init -- + * Initialize the slot array. 
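__wt_log_slot_new above loops over the pool until a free slot appears, signalling the write-LSN worker between passes so completed slots get retired. Stripped of the WiredTiger plumbing, the loop has this shape; all names here are illustrative, and the real scan runs under the slot lock, so it is effectively single-threaded.

#include <sched.h>

#define POOL_SIZE	16
enum { SLOT_FREE, SLOT_ACTIVE };

static struct slot { int state; } slot_pool[POOL_SIZE];

/* Hypothetical stand-in for waking the slot-retiring worker. */
static void signal_worker(void) {}

/* Scan the pool until a free slot appears; never gives up. */
static struct slot *
find_free_slot(void)
{
	int i;

	for (;;) {
		for (i = 0; i < POOL_SIZE; i++)
			if (slot_pool[i].state == SLOT_FREE) {
				slot_pool[i].state = SLOT_ACTIVE;
				return (&slot_pool[i]);
			}
		signal_worker();	/* ask the worker to retire slots */
		sched_yield();
	}
}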
*/ int -__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +__wt_log_slot_init(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + WT_DECL_RET; WT_LOG *log; - WT_LOGSLOT *newslot; - int64_t old_state; + WT_LOGSLOT *slot; + int32_t i; conn = S2C(session); log = conn->log; - /* - * Find an unused slot in the pool. - */ - WT_RET(__log_slot_find_free(session, &newslot)); + WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool); + for (i = 0; i < WT_SLOT_POOL; i++) + log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; /* - * Swap out the slot we're going to use and put a free one in the - * slot array in its place so that threads can use it right away. + * Allocate memory for buffers now that the arrays are set up. Split + * this out to make error handling simpler. */ - WT_STAT_FAST_CONN_INCR(session, log_slot_closes); - newslot->slot_state = WT_LOG_SLOT_READY; - newslot->slot_index = slot->slot_index; - log->slot_array[newslot->slot_index] = newslot; - old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING); - slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY); /* - * Note that this statistic may be much bigger than in reality, - * especially when compared with the total bytes written in - * __log_fill. The reason is that this size reflects any - * rounding up that is needed and the total bytes in __log_fill - * is the amount of user bytes. + * Cap the slot buffer to one-tenth of the maximum log file size if + * that is smaller than the default buffer size. + * + * !!! If the buffer size is too close to the log file size, we will + * switch log files very aggressively. Scale back the buffer for + * small log file sizes. */ + log->slot_buf_size = (uint32_t)WT_MIN( + (size_t)conn->log_file_max/10, WT_LOG_SLOT_BUF_SIZE); + for (i = 0; i < WT_SLOT_POOL; i++) { + WT_ERR(__wt_buf_init(session, + &log->slot_pool[i].slot_buf, log->slot_buf_size)); + F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); + } WT_STAT_FAST_CONN_INCRV(session, - log_slot_consolidated, (uint64_t)slot->slot_group_size); - return (0); + log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); + /* + * Set up the available slot from the pool the first time. + */ + slot = &log->slot_pool[0]; + /* + * We cannot initialize the release LSN in the activate function + * because that is called after a log file switch. + */ + slot->slot_release_lsn = log->alloc_lsn; + __wt_log_slot_activate(session, slot); + log->active_slot = slot; + + if (0) { +err: while (--i >= 0) + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } + return (ret); }
+ */ + for (i = 0; i < WT_SLOT_POOL; i++) { + slot = &log->slot_pool[i]; + if (!FLD64_ISSET( + (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) { + rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); + if (rel != 0) + WT_RET(__wt_write(session, slot->slot_fh, + slot->slot_start_offset, (size_t)rel, + slot->slot_buf.mem)); + } + __wt_buf_free(session, &log->slot_pool[i].slot_buf); + } return (0); } /* - * __wt_log_slot_wait -- - * Wait for slot leader to allocate log area and tell us our log offset. + * __wt_log_slot_join -- + * Join a consolidated logging slot. Must be called with + * the read lock held. */ -int -__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +void +__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, + uint32_t flags, WT_MYSLOT *myslot) { - int yield_count; + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LOGSLOT *slot; + int64_t flag_state, new_state, old_state, released; + int32_t join_offset, new_join; +#ifdef HAVE_DIAGNOSTIC + int unbuf_force; +#endif - yield_count = 0; - WT_UNUSED(session); + conn = S2C(session); + log = conn->log; - while (slot->slot_state > WT_LOG_SLOT_DONE) - if (++yield_count < 1000) - __wt_yield(); - else - __wt_sleep(0, 200); - return (0); + /* + * Make sure the length cannot overflow: a caller whose record + * doesn't fit in a buffered slot should use a direct write + * instead of calling this function. + */ + WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + + /* + * There should almost always be a slot open. + */ +#ifdef HAVE_DIAGNOSTIC + unbuf_force = ((++log->write_calls % 1000) == 0); +#endif + for (;;) { + WT_BARRIER(); + slot = log->active_slot; + old_state = slot->slot_state; + /* + * Try to join our size into the existing size and + * atomically write it back into the state. + */ + flag_state = WT_LOG_SLOT_FLAGS(old_state); + released = WT_LOG_SLOT_RELEASED(old_state); + join_offset = WT_LOG_SLOT_JOINED(old_state); +#ifdef HAVE_DIAGNOSTIC + if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) { +#else + if (mysize > WT_LOG_SLOT_BUF_MAX) { +#endif + new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; + F_SET(myslot, WT_MYSLOT_UNBUFFERED); + myslot->slot = slot; + } else + new_join = join_offset + (int32_t)mysize; + new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( + (int64_t)new_join, (int64_t)released, (int64_t)flag_state); + + /* + * Check if the slot is open for joining and we are able to + * swap our size into the state. + */ + if (WT_LOG_SLOT_OPEN(old_state) && + __wt_atomic_casiv64( + &slot->slot_state, old_state, new_state)) + break; + /* + * The slot is no longer open or we lost the race to + * update it. Yield and try again. + */ + WT_STAT_FAST_CONN_INCR(session, log_slot_races); + __wt_yield(); + } + /* + * We joined this slot. Fill in our information to return to + * the caller. + */ + if (mysize != 0) + WT_STAT_FAST_CONN_INCR(session, log_slot_joins); + if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) + F_SET(slot, WT_SLOT_SYNC_DIR); + if (LF_ISSET(WT_LOG_FSYNC)) + F_SET(slot, WT_SLOT_SYNC); + if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) { + WT_ASSERT(session, slot->slot_unbuffered == 0); + WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered); + slot->slot_unbuffered = (int64_t)mysize; + } + myslot->slot = slot; + myslot->offset = join_offset; + myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); } /* * __wt_log_slot_release -- * Each thread in a consolidated group releases its portion to - * signal it has completed writing its piece of the log.
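The WT_LOG_SLOT_JOINED, WT_LOG_SLOT_RELEASED and WT_LOG_SLOT_JOIN_REL macros used above pack the joined and released byte counts (plus flag bits) into a single 64-bit state word, so a writer can join a slot with one compare-and-swap. The following stand-alone sketch shows the idea with a deliberately simplified, hypothetical layout (joined count in the high half, released count in the low half, no flag bits); it is not WiredTiger's actual bit assignment.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define JOINED(state)	((int32_t)((uint64_t)(state) >> 32))
	#define RELEASED(state)	((int32_t)((state) & 0xffffffff))
	#define MAKE_STATE(j, r) (((int64_t)(j) << 32) | (uint32_t)(r))

	/* Join "size" bytes into a slot; returns this thread's offset. */
	static int32_t
	slot_join(_Atomic int64_t *state, int32_t size)
	{
		int64_t old_state, new_state;

		for (;;) {
			old_state = atomic_load(state);
			new_state = MAKE_STATE(
			    JOINED(old_state) + size, RELEASED(old_state));
			if (atomic_compare_exchange_weak(
			    state, &old_state, new_state))
				return (JOINED(old_state)); /* our offset */
			/* Lost the race with another joiner, retry. */
		}
	}

	/*
	 * Release after copying; the slot is done when the released count
	 * catches the joined count. (A real implementation must also keep
	 * the two fields from overflowing into each other.)
	 */
	static bool
	slot_release(_Atomic int64_t *state, int32_t size)
	{
		int64_t new_state;

		new_state = atomic_fetch_add(state, MAKE_STATE(0, size)) +
		    MAKE_STATE(0, size);
		return (RELEASED(new_state) == JOINED(new_state));
	}

Because both counters live in one word, a slot leader can also change the slot's availability in the same atomic update, which is roughly the role the real flag bits play.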
+ * signal it has completed copying its piece of the log into + * the memory buffer. */ int64_t -__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size) +__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size) { - int64_t newsize; + WT_LOGSLOT *slot; + wt_off_t cur_offset, my_start; + int64_t my_size, rel_size; + WT_UNUSED(session); + slot = myslot->slot; + my_start = slot->slot_start_offset + myslot->offset; + while ((cur_offset = slot->slot_last_offset) < my_start) { + /* + * Set our offset if we are larger. + */ + if (__wt_atomic_casiv64( + &slot->slot_last_offset, cur_offset, my_start)) + break; + /* + * If we raced another thread updating this, try again. + */ + WT_BARRIER(); + } /* - * Add my size into the state. When it reaches WT_LOG_SLOT_DONE - * all participatory threads have completed copying their piece. + * Add my size into the state and return the new size. */ - newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size); - return (newsize); + rel_size = size; + if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) + rel_size = WT_LOG_SLOT_UNBUFFERED; + my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0); + return (__wt_atomic_addiv64(&slot->slot_state, my_size)); } /* * __wt_log_slot_free -- * Free a slot back into the pool. */ -int +void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - WT_UNUSED(session); /* * Make sure flags don't get retained between uses. * We have to reset them here because multiple threads may * change the flags when joining the slot. */ + WT_UNUSED(session); slot->flags = WT_SLOT_INIT_FLAGS; + slot->slot_error = 0; slot->slot_state = WT_LOG_SLOT_FREE; - return (0); } diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c index 84b8d5c9532..6068bb6c559 100644 --- a/src/lsm/lsm_cursor.c +++ b/src/lsm/lsm_cursor.c @@ -134,7 +134,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm) if (have_primary) { WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, - ovfl = __wt_btree_lsm_size(session, hard_limit ? + ovfl = __wt_btree_lsm_over_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); WT_LEAVE_PAGE_INDEX(session); @@ -1066,12 +1066,12 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) ret = __wt_bloom_hash_get(bloom, &bhash); if (ret == WT_NOTFOUND) { - WT_STAT_FAST_INCR(session, - &clsm->lsm_tree->stats, bloom_miss); + WT_LSM_TREE_STAT_INCR( + session, clsm->lsm_tree->bloom_miss); continue; } else if (ret == 0) - WT_STAT_FAST_INCR(session, - &clsm->lsm_tree->stats, bloom_hit); + WT_LSM_TREE_STAT_INCR( + session, clsm->lsm_tree->bloom_hit); WT_ERR(ret); } c->set_key(c, &cursor->key); @@ -1086,11 +1086,11 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value) F_CLR(c, WT_CURSTD_KEY_SET); /* Update stats: the active chunk can't have a bloom filter.
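The slot_last_offset loop in __wt_log_slot_release above is a lock-free "atomic maximum": each releasing thread installs its end offset only if it is larger than the current value. A minimal sketch of that idiom, using C11 atomics in place of __wt_atomic_casiv64:

	#include <stdatomic.h>
	#include <stdint.h>

	static void
	atomic_max_i64(_Atomic int64_t *dest, int64_t val)
	{
		int64_t cur;

		cur = atomic_load(dest);
		/* A failed CAS refreshes "cur", so we simply retry. */
		while (cur < val &&
		    !atomic_compare_exchange_weak(dest, &cur, val))
			;
	}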
*/ if (bloom != NULL) - WT_STAT_FAST_INCR(session, - &clsm->lsm_tree->stats, bloom_false_positive); + WT_LSM_TREE_STAT_INCR(session, + clsm->lsm_tree->bloom_false_positive); else if (clsm->primary_chunk == NULL || i != clsm->nchunks) - WT_STAT_FAST_INCR(session, - &clsm->lsm_tree->stats, lsm_lookup_no_bloom); + WT_LSM_TREE_STAT_INCR(session, + clsm->lsm_tree->lsm_lookup_no_bloom); } WT_ERR(WT_NOTFOUND); @@ -1331,12 +1331,12 @@ __clsm_put(WT_SESSION_IMPL *session, ++clsm->update_count >= 100) && lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) { clsm->update_count = 0; - WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, - lsm_checkpoint_throttle, lsm_tree->ckpt_throttle); + WT_LSM_TREE_STAT_INCRV(session, + lsm_tree->lsm_checkpoint_throttle, lsm_tree->ckpt_throttle); WT_STAT_FAST_CONN_INCRV(session, lsm_checkpoint_throttle, lsm_tree->ckpt_throttle); - WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats, - lsm_merge_throttle, lsm_tree->merge_throttle); + WT_LSM_TREE_STAT_INCRV(session, + lsm_tree->lsm_merge_throttle, lsm_tree->merge_throttle); WT_STAT_FAST_CONN_INCRV(session, lsm_merge_throttle, lsm_tree->merge_throttle); __wt_sleep(0, diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index cb078d991d8..6c59232b619 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -258,7 +258,7 @@ __wt_lsm_manager_free_work_unit( if (entry != NULL) { WT_ASSERT(session, entry->lsm_tree->queue_ref > 0); - (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1); + (void)__wt_atomic_sub32(&entry->lsm_tree->queue_ref, 1); __wt_free(session, entry); } } @@ -273,7 +273,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LSM_MANAGER *manager; - WT_LSM_WORK_UNIT *current, *next; + WT_LSM_WORK_UNIT *current; WT_SESSION *wt_session; uint32_t i; uint64_t removed; @@ -297,23 +297,17 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session) manager->lsm_worker_cookies[0].tid = 0; /* Release memory from any operations left on the queue. */ - for (current = TAILQ_FIRST(&manager->switchqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); + while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) { TAILQ_REMOVE(&manager->switchqh, current, q); ++removed; __wt_lsm_manager_free_work_unit(session, current); } - for (current = TAILQ_FIRST(&manager->appqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); + while ((current = TAILQ_FIRST(&manager->appqh)) != NULL) { TAILQ_REMOVE(&manager->appqh, current, q); ++removed; __wt_lsm_manager_free_work_unit(session, current); } - for (current = TAILQ_FIRST(&manager->managerqh); - current != NULL; current = next) { - next = TAILQ_NEXT(current, q); + while ((current = TAILQ_FIRST(&manager->managerqh)) != NULL) { TAILQ_REMOVE(&manager->managerqh, current, q); ++removed; __wt_lsm_manager_free_work_unit(session, current); @@ -645,9 +639,9 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, * on close, the flag is cleared and then the queue reference count * is checked. 
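A side note on the queue draining in __wt_lsm_manager_destroy above: the diff replaces walk-with-saved-next loops with pop-from-the-head loops, which stay obviously correct even though entries are freed during the walk. A sketch of the idiom with illustrative types, not WiredTiger's:

	#include <stdlib.h>
	#include <sys/queue.h>

	struct unit {
		TAILQ_ENTRY(unit) q;
	};
	TAILQ_HEAD(unitq, unit);

	static void
	drain(struct unitq *head)
	{
		struct unit *u;

		/* Detach the head first; freeing "u" can't corrupt the walk. */
		while ((u = TAILQ_FIRST(head)) != NULL) {
			TAILQ_REMOVE(head, u, q);
			free(u);
		}
	}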
*/ - (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1); + (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1); if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) { - (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1); + (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1); return (0); } @@ -674,6 +668,6 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session, return (0); err: if (!pushed) - (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1); + (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1); return (ret); } diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c index d7e684b8f51..01a61359949 100644 --- a/src/lsm/lsm_merge.c +++ b/src/lsm/lsm_merge.c @@ -398,7 +398,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) locked = 0; /* Allocate an ID for the merge. */ - dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1); + dest_id = __wt_atomic_add32(&lsm_tree->last, 1); /* * We only want to do the chunk loop if we're running with verbose, @@ -493,7 +493,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) * merge_syncing field so that compact knows it is still in * progress. */ - (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1); + (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1); in_sync = 1; /* * We've successfully created the new chunk. Now install it. We need @@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) @@ -544,7 +544,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) WT_TRET(dest->close(dest)); dest = NULL; ++lsm_tree->merge_progressing; - (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1); + (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); in_sync = 0; WT_ERR_NOTFOUND_OK(ret); @@ -600,7 +600,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (in_sync) - (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1); + (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); if (src != NULL) WT_TRET(src->close(src)); if (dest != NULL) @@ -632,6 +632,6 @@ err: if (locked) "Merge failed with %s", __wt_strerror(session, ret, NULL, 0))); } - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c index 126a59af0d1..2817ec9eeb7 100644 --- a/src/lsm/lsm_stat.c +++ b/src/lsm/lsm_stat.c @@ -22,6 +22,7 @@ __curstat_lsm_init( WT_DSRC_STATS *new, *stats; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; + int64_t bloom_count; u_int i; int locked; char config[64]; @@ -49,25 +50,22 @@ __curstat_lsm_init( cfg[1] = disk_cfg[1] = config; } - /* - * Set the cursor to reference the data source statistics; we don't - * initialize it, instead we copy (rather than aggregate), the first - * chunk's statistics, which has the same effect. - */ - stats = &cst->u.dsrc_stats; - /* Hold the LSM lock so that we can safely walk through the chunks. */ WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); locked = 1; - /* Initialize the statistics. */ - __wt_stat_init_dsrc_stats(stats); + /* + * Set the cursor to reference the data source statistics into which + * we're going to aggregate statistics from the underlying objects. 
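The queue_ref handshake in __wt_lsm_manager_push_entry above (take a reference, re-check the ACTIVE flag, back the reference out on failure) pairs with the close path, which clears the flag before checking the count. A stand-alone sketch of the protocol, with hypothetical types and C11 atomics in place of the __wt_atomic_* wrappers:

	#include <stdatomic.h>
	#include <stdbool.h>

	struct tree {
		_Atomic unsigned queue_ref;
		_Atomic bool active;
	};

	/* Producer: returns true if work may be queued against the tree. */
	static bool
	enqueue_enter(struct tree *t)
	{
		atomic_fetch_add(&t->queue_ref, 1);
		if (!atomic_load(&t->active)) {
			/* Raced with close: back our reference out. */
			atomic_fetch_sub(&t->queue_ref, 1);
			return (false);
		}
		return (true);
	}

	/* Close side: clear the flag, then wait for references to drain. */
	static void
	close_tree(struct tree *t)
	{
		atomic_store(&t->active, false);
		while (atomic_load(&t->queue_ref) != 0)
			;	/* spin or sleep until producers back out */
	}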
+ */ + stats = &cst->u.dsrc_stats; + __wt_stat_dsrc_init_single(stats); /* * For each chunk, aggregate its statistics, as well as any associated * bloom filter statistics, into the total statistics. */ - for (i = 0; i < lsm_tree->nchunks; i++) { + for (bloom_count = 0, i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; /* @@ -93,17 +91,17 @@ __curstat_lsm_init( * top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - WT_STAT_SET(new, lsm_generation_max, chunk->generation); + new->lsm_generation_max = chunk->generation; /* Aggregate statistics from each new chunk. */ - __wt_stat_aggregate_dsrc_stats(new, stats); + __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) continue; /* Maintain a count of bloom filters. */ - WT_STAT_INCR(&lsm_tree->stats, bloom_count); + ++bloom_count; /* Get the bloom filter's underlying object. */ WT_ERR(__wt_buf_fmt( @@ -117,24 +115,39 @@ __curstat_lsm_init( * into the top-level. */ new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - WT_STAT_SET(new, - bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8); - WT_STAT_SET(new, bloom_page_evict, - WT_STAT(new, cache_eviction_clean) + - WT_STAT(new, cache_eviction_dirty)); - WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read)); - - __wt_stat_aggregate_dsrc_stats(new, stats); + new->bloom_size = + (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8); + new->bloom_page_evict = + new->cache_eviction_clean + new->cache_eviction_dirty; + new->bloom_page_read = new->cache_read; + + __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } /* Set statistics that aren't aggregated directly into the cursor */ - WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks); + stats->bloom_count = bloom_count; + stats->lsm_chunk_count = lsm_tree->nchunks; - /* Aggregate, and optionally clear, LSM-level specific information. */ - __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats); + /* Include, and optionally clear, LSM-level specific information. */ + stats->bloom_miss = lsm_tree->bloom_miss; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + lsm_tree->bloom_miss = 0; + stats->bloom_hit = lsm_tree->bloom_hit; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + lsm_tree->bloom_hit = 0; + stats->bloom_false_positive = lsm_tree->bloom_false_positive; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + lsm_tree->bloom_false_positive = 0; + stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + lsm_tree->lsm_lookup_no_bloom = 0; + stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle; + if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) + lsm_tree->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle; if (F_ISSET(cst, WT_CONN_STAT_CLEAR)) - __wt_stat_refresh_dsrc_stats(&lsm_tree->stats); + lsm_tree->lsm_merge_throttle = 0; __wt_curstat_dsrc_final(cst); diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c index 6c6b185f821..46db76e099c 100644 --- a/src/lsm/lsm_tree.c +++ b/src/lsm/lsm_tree.c @@ -141,7 +141,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session) * is no need to decrement the reference count since discard * is unconditional. 
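The repeated copy-then-optionally-clear blocks above invite a small helper; a hypothetical macro (not part of WiredTiger) that would keep the six tree-level counters in lsm_stat.c from drifting apart:

	#define LSM_STAT_TRANSFER(stats, tree, clear, name) do {	\
		(stats)->name = (tree)->name;				\
		if (clear)						\
			(tree)->name = 0;				\
	} while (0)

	/* Usage, mirroring the code above:
	 *	LSM_STAT_TRANSFER(stats, lsm_tree,
	 *	    F_ISSET(cst, WT_CONN_STAT_CLEAR), bloom_miss);
	 */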
*/ - (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1); + (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); WT_TRET(__lsm_tree_close(session, lsm_tree)); WT_TRET(__lsm_tree_discard(session, lsm_tree, 1)); } @@ -486,15 +486,17 @@ __lsm_tree_find(WT_SESSION_IMPL *session, * Make sure we win the race to switch on the * exclusive flag. */ - if (!WT_ATOMIC_CAS1(lsm_tree->exclusive, 0, 1)) + if (!__wt_atomic_cas8( + &lsm_tree->exclusive, 0, 1)) return (EBUSY); /* Make sure there are no readers */ - if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) { + if (!__wt_atomic_cas32( + &lsm_tree->refcnt, 0, 1)) { lsm_tree->exclusive = 0; return (EBUSY); } } else { - (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1); + (void)__wt_atomic_add32(&lsm_tree->refcnt, 1); /* * We got a reference, check if an exclusive @@ -503,8 +505,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session, if (lsm_tree->exclusive) { WT_ASSERT(session, lsm_tree->refcnt > 0); - (void)WT_ATOMIC_SUB4( - lsm_tree->refcnt, 1); + (void)__wt_atomic_sub32( + &lsm_tree->refcnt, 1); return (EBUSY); } } @@ -565,7 +567,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); /* Start the LSM manager thread if it isn't running. */ - if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1)) + if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1)) WT_RET(__wt_lsm_manager_start(session)); /* Make sure no one beat us to it. */ @@ -596,7 +598,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session, * with getting handles exclusive. */ lsm_tree->refcnt = 1; - lsm_tree->exclusive = (int8_t)exclusive; + lsm_tree->exclusive = exclusive ? 1 : 0; lsm_tree->queue_ref = 0; /* Set a flush timestamp as a baseline. */ @@ -644,7 +646,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_ASSERT(session, lsm_tree->refcnt > 0); if (lsm_tree->exclusive) lsm_tree->exclusive = 0; - (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1); + (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1); } /* How aggressively to ramp up or down throttle due to level 0 merging */ @@ -839,7 +841,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, 0); - new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1); + new_id = __wt_atomic_add32(&lsm_tree->last, 1); WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); @@ -1097,7 +1099,7 @@ __wt_lsm_tree_truncate( /* Create the new chunk. */ WT_ERR(__wt_calloc_one(session, &chunk)); - chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1); + chunk->id = __wt_atomic_add32(&lsm_tree->last, 1); WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); /* Mark all chunks old. */ @@ -1142,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. 
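The two-step handle acquisition in __lsm_tree_find above works like this: exclusive access first claims the "exclusive" flag and then requires the reference count to be exactly zero; shared access takes a reference and then re-checks "exclusive"; either side backs out with EBUSY when it loses the race. A sketch with illustrative types and C11 atomics:

	#include <errno.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	struct handle {
		_Atomic unsigned refcnt;
		_Atomic bool exclusive;
	};

	static int
	acquire(struct handle *h, bool want_exclusive)
	{
		bool f = false;
		unsigned zero = 0;

		if (want_exclusive) {
			if (!atomic_compare_exchange_strong(
			    &h->exclusive, &f, true))
				return (EBUSY);	/* another exclusive holder */
			if (!atomic_compare_exchange_strong(
			    &h->refcnt, &zero, 1)) {
				atomic_store(&h->exclusive, false);
				return (EBUSY);	/* readers still present */
			}
			return (0);
		}
		atomic_fetch_add(&h->refcnt, 1);
		if (atomic_load(&h->exclusive)) {
			atomic_fetch_sub(&h->refcnt, 1);
			return (EBUSY);		/* writer won the race */
		}
		return (0);
	}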
*/ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1155,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); @@ -1175,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Diagnostic: avoid deadlocks with the schema lock: if we need it for * an operation, we should already have it. */ - F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); return (0); } @@ -1188,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_DECL_RET; - F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK); + F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK); if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0) WT_PANIC_RET(session, ret, "Unlocking an LSM tree"); @@ -1207,7 +1209,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) WT_LSM_TREE *lsm_tree; time_t begin, end; uint64_t progress; - int i, compacting, flushing, locked, ref; + uint32_t i; + int compacting, flushing, locked, ref; compacting = flushing = locked = ref = 0; chunk = NULL; @@ -1282,7 +1285,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) * If we have a chunk, we want to look for it to be on-disk. * So we need to add a reference to keep it available. */ - (void)WT_ATOMIC_ADD4(chunk->refcnt, 1); + (void)__wt_atomic_add32(&chunk->refcnt, 1); ref = 1; } @@ -1330,7 +1333,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) "Start compacting progress %" PRIu64, name, chunk->id, lsm_tree->merge_progressing)); - (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + (void)__wt_atomic_sub32(&chunk->refcnt, 1); flushing = ref = 0; compacting = 1; F_SET(lsm_tree, WT_LSM_TREE_COMPACTING); @@ -1384,7 +1387,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip) err: /* Ensure anything we set is cleared. */ if (ref) - (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + (void)__wt_atomic_sub32(&chunk->refcnt, 1); if (compacting) { F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING); lsm_tree->merge_aggressiveness = 0; diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c index c3bee162ea1..8eba0127b8b 100644 --- a/src/lsm/lsm_work_unit.c +++ b/src/lsm/lsm_work_unit.c @@ -53,7 +53,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session, * it's safe. */ for (i = 0; i < nchunks; i++) - (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1); + (void)__wt_atomic_add32(&cookie->chunk_array[i]->refcnt, 1); err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); @@ -122,7 +122,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, force ? 
" w/ force" : "", i, lsm_tree->nchunks, chunk->uri)); - (void)WT_ATOMIC_ADD4(chunk->refcnt, 1); + (void)__wt_atomic_add32(&chunk->refcnt, 1); } err: WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree)); @@ -145,7 +145,7 @@ __lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie) if (cookie->chunk_array[i] == NULL) continue; WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0); - (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1); + (void)__wt_atomic_sub32(&cookie->chunk_array[i]->refcnt, 1); } /* Ensure subsequent calls don't double decrement. */ cookie->nchunks = 0; @@ -223,7 +223,7 @@ __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * See if we win the race to switch on the "busy" flag and * recheck that the chunk still needs a Bloom filter. */ - if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) { + if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) { if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) { ret = __lsm_bloom_create( session, lsm_tree, chunk, (u_int)i); @@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * - * Use the special eviction isolation level to avoid interfering with - * an application checkpoint: we have already checked that all of the - * updates in this chunk are globally visible. - * - * !!! We can wait here for checkpoints and fsyncs to complete, which - * can be a long time. + * !!! + * We can wait here for checkpoints and fsyncs to complete, which can + * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { + /* + * Set read-uncommitted: we have already checked that all of the + * updates in this chunk are globally visible, use the cheapest + * possible check in reconciliation. + */ saved_isolation = session->txn.isolation; - session->txn.isolation = WT_ISO_EVICTION; + session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); @@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ - F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); @@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session, err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); - F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK); + F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); } @@ -528,7 +530,7 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) * Make sure only a single thread is freeing the old chunk array * at any time. 
*/ - if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1)) + if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1)) return (0); /* * Take a copy of the current state of the LSM tree and look for chunks diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c index 8ed4a117641..3add3155e17 100644 --- a/src/lsm/lsm_worker.c +++ b/src/lsm/lsm_worker.c @@ -65,7 +65,7 @@ __lsm_worker_general_op( ret = __wt_lsm_checkpoint_chunk( session, entry->lsm_tree, chunk); WT_ASSERT(session, chunk->refcnt > 0); - (void)WT_ATOMIC_SUB4(chunk->refcnt, 1); + (void)__wt_atomic_sub32(&chunk->refcnt, 1); WT_ERR(ret); } } else if (entry->type == WT_LSM_WORK_DROP) diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index 6d08ce3aa6a..315621f2ae9 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "file:")) break; - else if (strcmp(uri, WT_METAFILE_URI) == 0) + if (strcmp(uri, WT_METAFILE_URI) == 0) continue; /* diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c index 227d0fa9a6c..8255f004dab 100644 --- a/src/meta/meta_table.c +++ b/src/meta/meta_table.c @@ -12,22 +12,22 @@ * __metadata_turtle -- * Return if a key's value should be taken from the turtle file. */ -static int +static bool __metadata_turtle(const char *key) { switch (key[0]) { case 'f': if (strcmp(key, WT_METAFILE_URI) == 0) - return (1); + return (true); break; case 'W': if (strcmp(key, "WiredTiger version") == 0) - return (1); + return (true); if (strcmp(key, "WiredTiger version string") == 0) - return (1); + return (true); break; } - return (0); + return (false); } /* @@ -37,6 +37,8 @@ __metadata_turtle(const char *key) int __wt_metadata_open(WT_SESSION_IMPL *session) { + WT_BTREE *btree; + if (session->meta_dhandle != NULL) return (0); @@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session) session->meta_dhandle = session->dhandle; WT_ASSERT(session, session->meta_dhandle != NULL); - /* The meta_dhandle doesn't need to stay locked -- release it. */ + /* + * Set special flags for the metadata file: eviction (the metadata file + * is in-memory and never evicted), logging (the metadata file is always + * logged if possible). + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). + */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_IN_MEMORY)) + F_SET(btree, WT_BTREE_IN_MEMORY); + if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) + F_SET(btree, WT_BTREE_NO_EVICTION); + if (F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_CLR(btree, WT_BTREE_NO_LOGGING); + + /* The metadata handle doesn't need to stay locked -- release it. 
*/ return (__wt_session_release_btree(session)); } @@ -59,9 +78,9 @@ __wt_metadata_cursor( { WT_DATA_HANDLE *saved_dhandle; WT_DECL_RET; + int is_dead; const char *cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL }; - int is_dead; saved_dhandle = session->dhandle; WT_ERR(__wt_metadata_open(session)); diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c index 4d04f9ac579..eb2482723ec 100644 --- a/src/os_posix/os_alloc.c +++ b/src/os_posix/os_alloc.c @@ -58,7 +58,9 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp) WT_STAT_FAST_CONN_INCR(session, memory_allocation); if ((p = calloc(number, size)) == NULL) - WT_RET_MSG(session, __wt_errno(), "memory allocation"); + WT_RET_MSG(session, __wt_errno(), + "memory allocation of %" WT_SIZET_FMT " bytes failed", + size * number); *(void **)retp = p; return (0); @@ -100,7 +102,9 @@ __wt_realloc(WT_SESSION_IMPL *session, } if ((p = realloc(p, bytes_to_allocate)) == NULL) - WT_RET_MSG(session, __wt_errno(), "memory allocation"); + WT_RET_MSG(session, __wt_errno(), + "memory allocation of %" WT_SIZET_FMT " bytes failed", + bytes_to_allocate); /* * Clear the allocated memory -- an application might: allocate memory, @@ -171,7 +175,9 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session, if ((ret = posix_memalign(&newp, S2C(session)->buffer_alignment, bytes_to_allocate)) != 0) - WT_RET_MSG(session, ret, "memory allocation"); + WT_RET_MSG(session, ret, + "memory allocation of %" WT_SIZET_FMT + " bytes failed", bytes_to_allocate); if (p != NULL) memcpy(newp, p, bytes_allocated); diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c index dfd72dd0cd2..7946b4ab0cc 100644 --- a/src/os_posix/os_mtx_cond.c +++ b/src/os_posix/os_mtx_cond.c @@ -41,11 +41,13 @@ err: __wt_free(session, cond); } /* - * __wt_cond_wait -- - * Wait on a mutex, optionally timing out. + * __wt_cond_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it + * before the time out period expires, let the caller know. */ int -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled) { struct timespec ts; WT_DECL_RET; @@ -54,7 +56,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) locked = 0; /* Fast path if already signalled. */ - if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0) + *signalled = 1; + if (__wt_atomic_addi32(&cond->waiters, 1) == 0) return (0); /* @@ -88,10 +91,12 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) #ifdef ETIME ret == ETIME || #endif - ret == ETIMEDOUT) + ret == ETIMEDOUT) { + *signalled = 0; ret = 0; + } - (void)WT_ATOMIC_SUB4(cond->waiters, 1); + (void)__wt_atomic_subi32(&cond->waiters, 1); err: if (locked) WT_TRET(pthread_mutex_unlock(&cond->mtx)); @@ -124,7 +129,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) if (cond->waiters == -1) return (0); - if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) { + if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) { WT_ERR(pthread_mutex_lock(&cond->mtx)); locked = 1; WT_ERR(pthread_cond_broadcast(&cond->cond)); diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c index cdd4f8a24e1..d47ab197643 100644 --- a/src/os_posix/os_mtx_rw.c +++ b/src/os_posix/os_mtx_rw.c @@ -38,6 +38,78 @@ * Joseph Seigh. 
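The waiters counter that __wt_cond_wait_signal and __wt_cond_signal cooperate on encodes three states: -1 (a signal is pending with no waiter), 0 (idle), and a positive count of sleeping threads; that encoding is what makes the lock-free fast paths above possible. A sketch of just the counter protocol (illustrative; the mutex handling, actual pthread calls and error paths are omitted):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct cond {
		_Atomic int waiters;
		/* ... mutex and condition variable ... */
	};

	static bool		/* true: already signalled, no sleep needed */
	cond_wait_fastpath(struct cond *c)
	{
		/* -1 + 1 == 0: a pending signal, consume it and return. */
		if (atomic_fetch_add(&c->waiters, 1) + 1 == 0)
			return (true);
		/* Slow path: take the mutex, sleep, decrement waiters. */
		return (false);
	}

	static void
	cond_signal(struct cond *c)
	{
		int zero = 0;

		if (atomic_load(&c->waiters) == -1)
			return;		/* a signal is already pending */
		if (atomic_load(&c->waiters) > 0 ||
		    !atomic_compare_exchange_strong(&c->waiters, &zero, -1)) {
			/* Waiters exist: take the mutex and broadcast. */
		}
	}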
Note that a similar (but not identical) algorithm was published * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable * Reader-Writer Synchronization for Shared-Memory Multiprocessors". + * + * The following is an explanation of this code. First, the underlying lock + * structure. + * + * struct { + * uint16_t writers; Now serving for writers + * uint16_t readers; Now serving for readers + * uint16_t users; Next available ticket number + * uint16_t __notused; Padding + * } + * + * Now, imagine a store's 'take a number' ticket algorithm. A customer takes + * a unique ticket number and customers are served in ticket order. In the data + * structure, 'writers' is the next writer to be served, 'readers' is the next + * reader to be served, and 'users' is the next available ticket number. + * + * Next, consider exclusive (write) locks. The 'now serving' number for writers + * is 'writers'. To lock, 'take a number' and wait until that number is being + * served; more specifically, atomically copy and increment the current value of + * 'users', and then wait until 'writers' equals that copied number. + * + * Shared (read) locks are similar. Like writers, readers atomically get the + * next number available. However, instead of waiting for 'writers' to equal + * their number, they wait for 'readers' to equal their number. + * + * This has the effect of queuing lock requests in the order they arrive + * (incidentally avoiding starvation). + * + * Each lock/unlock pair requires incrementing both 'readers' and 'writers'. + * In the case of a reader, the 'readers' increment happens when the reader + * acquires the lock (to allow read-lock sharing), and the 'writers' increment + * happens when the reader releases the lock. In the case of a writer, both + * 'readers' and 'writers' are incremented when the writer releases the lock. + * + * For example, consider the following read (R) and write (W) lock requests: + * + * writers readers users + * 0 0 0 + * R: ticket 0, readers match OK 0 1 1 + * R: ticket 1, readers match OK 0 2 2 + * R: ticket 2, readers match OK 0 3 3 + * W: ticket 3, writers no match block 0 3 4 + * R: ticket 2, unlock 1 3 4 + * R: ticket 0, unlock 2 3 4 + * R: ticket 1, unlock 3 3 4 + * W: ticket 3, writers match OK 3 3 4 + * + * Note that the writer blocks until 'writers' equals its ticket number, and + * that it does not matter whether the readers unlock in order. + * + * Readers or writers entering the system after the write lock request is + * queued will block; the next ticket holder (reader or writer) unblocks when + * the writer unlocks. An example, continuing from the last line of the above + * example: + * + * writers readers users + * W: ticket 3, writers match OK 3 3 4 + * R: ticket 4, readers no match block 3 3 5 + * R: ticket 5, readers no match block 3 3 6 + * W: ticket 6, writers no match block 3 3 7 + * W: ticket 3, unlock 4 4 7 + * R: ticket 4, readers match OK 4 5 7 + * R: ticket 5, readers match OK 4 6 7 + * + * The 'users' field is a 2-byte value, so the available ticket number wraps at + * 64K requests. If a thread's lock request is not granted until the 'users' + * field cycles and the same ticket is taken by another thread, we could grant + * a lock to two separate threads at the same time, and bad things happen: two + * writer threads or a reader thread and a writer thread would run in parallel, + * and lock waiters could be skipped if the unlocks race. This is unlikely; it + * only happens if a lock request is blocked by 64K other requests. The fix is
The fix is + * to grow the lock structure fields, but the largest atomic instruction we have + * is 8 bytes, the structure has no room to grow. */ #include "wt_internal.h" @@ -69,20 +141,31 @@ __wt_rwlock_alloc( int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, users, writers; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - pad = l->s.pad; - users = l->s.users; - writers = l->s.writers; - old = (pad << 48) + (users << 32) + (users << 16) + writers; - new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + new = old = *l; + + /* + * This read lock can only be granted if the lock was last granted to + * a reader and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this read lock). + */ + if (old.s.readers != old.s.users) + return (EBUSY); + + /* + * The replacement lock value is a result of allocating a new ticket and + * incrementing the reader value to match it. + */ + new.s.readers = new.s.users = old.s.users + 1; + return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY); } /* @@ -93,8 +176,7 @@ int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; int pause_cnt; WT_RET(__wt_verbose( @@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - for (pause_cnt = 0; val != l->s.readers;) { + + /* + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. + */ + ticket = __wt_atomic_fetch_add16(&l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.readers;) { /* * We failed to get the lock; pause before retrying and if we've * paused enough, sleep so we don't burn CPU to no purpose. This * situation happens if there are more threads than cores in the - * system and we're thrashing on shared resources. Regardless, - * don't sleep long, all we need is to schedule the other reader - * threads to complete a few more instructions and increment the - * reader count. + * system and we're thrashing on shared resources. + * + * Don't sleep long when waiting on a read lock, hopefully we're + * waiting on another read thread to increment the reader count. */ if (++pause_cnt < 1000) WT_PAUSE(); @@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) __wt_sleep(0, 10); } + /* + * We're the only writer of the readers field, so the update does not + * need to be atomic. + */ ++l->s.readers; return (0); @@ -138,7 +229,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name)); l = &rwlock->rwlock; - WT_ATOMIC_ADD2(l->s.writers, 1); + + /* + * Increment the writers value (other readers are doing the same, make + * sure we don't race). 
+ */ + (void)__wt_atomic_add16(&l->s.writers, 1); return (0); } @@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, readers, users; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); l = &rwlock->rwlock; - pad = l->s.pad; - readers = l->s.readers; - users = l->s.users; - old = (pad << 48) + (users << 32) + (readers << 16) + users; - new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + old = new = *l; + + /* + * This write lock can only be granted if the lock was last granted to + * a writer and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this write lock). + */ + if (old.s.writers != old.s.users) + return (EBUSY); + + /* The replacement lock value is a result of allocating a new ticket. */ + ++new.s.users; + return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY); } /* @@ -174,23 +278,33 @@ int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; + int pause_cnt; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); + l = &rwlock->rwlock; + /* - * Possibly wrap: if we have more than 64K lockers waiting, the count - * of writers will wrap and two lockers will simultaneously be granted - * the write lock. + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. */ - l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - while (val != l->s.writers) - WT_PAUSE(); + ticket = __wt_atomic_fetch_add16(&l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.writers;) { + /* + * We failed to get the lock; pause before retrying and if we've + * paused enough, sleep so we don't burn CPU to no purpose. This + * situation happens if there are more threads than cores in the + * system and we're thrashing on shared resources. + */ + if (++pause_cnt < 1000) + WT_PAUSE(); + else + __wt_sleep(0, 10); + } return (0); } @@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) copy = *l; + /* + * We're the only writer of the writers/readers fields, so the update + * does not need to be atomic; we have to update both values at the + * same time though, otherwise we'd potentially race with the thread + * next granted the lock. + * + * Use a memory barrier to ensure the compiler doesn't mess with these + * instructions and rework the code in a way that avoids the update as + * a unit. 
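Pulling the pieces of the read/write lock together, here is a stand-alone sketch of the ticket algorithm the long comment above describes. It uses three separate C11 atomics for clarity; the real structure packs the three 16-bit fields into one 64-bit word precisely so the try-lock variants above can update them with a single compare-and-swap.

	#include <sched.h>
	#include <stdatomic.h>
	#include <stdint.h>

	struct ticket_rwlock {
		_Atomic uint16_t writers;	/* now serving, writers */
		_Atomic uint16_t readers;	/* now serving, readers */
		_Atomic uint16_t users;		/* next free ticket */
	};

	static void
	read_lock(struct ticket_rwlock *l)
	{
		uint16_t ticket = atomic_fetch_add(&l->users, 1);

		while (ticket != atomic_load(&l->readers))
			sched_yield();
		/* Admit the next reader immediately: read locks share. */
		atomic_fetch_add(&l->readers, 1);
	}

	static void
	read_unlock(struct ticket_rwlock *l)
	{
		atomic_fetch_add(&l->writers, 1);	/* may admit a writer */
	}

	static void
	write_lock(struct ticket_rwlock *l)
	{
		uint16_t ticket = atomic_fetch_add(&l->users, 1);

		while (ticket != atomic_load(&l->writers))
			sched_yield();
	}

	static void
	write_unlock(struct ticket_rwlock *l)
	{
		/* Admit the next ticket holder, reader or writer. */
		atomic_fetch_add(&l->readers, 1);
		atomic_fetch_add(&l->writers, 1);
	}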
+ */ WT_BARRIER(); ++copy.s.writers; ++copy.s.readers; - l->i.us = copy.i.us; + l->i.wr = copy.i.wr; + return (0); } diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c index 7a4f5fdb38d..ef4662aa369 100644 --- a/src/os_posix/os_open.c +++ b/src/os_posix/os_open.c @@ -53,7 +53,7 @@ __wt_open(WT_SESSION_IMPL *session, hash = __wt_hash_city64(name, strlen(name)); bucket = hash % WT_HASH_ARRAY_SIZE; __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) { + TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) { if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; @@ -167,7 +167,7 @@ setupfh: */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) { + TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) { if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; @@ -177,7 +177,7 @@ setupfh: } if (!matched) { WT_CONN_FILE_INSERT(conn, fh, bucket); - (void)WT_ATOMIC_ADD4(conn->open_file_count, 1); + (void)__wt_atomic_add32(&conn->open_file_count, 1); *fhp = fh; } __wt_spin_unlock(session, &conn->fh_lock); @@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) fh = *fhp; *fhp = NULL; + WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name)); + __wt_spin_lock(session, &conn->fh_lock); if (fh == NULL || fh->ref == 0 || --fh->ref > 0) { __wt_spin_unlock(session, &conn->fh_lock); @@ -222,7 +224,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) /* Remove from the list. */ bucket = fh->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_FILE_REMOVE(conn, fh, bucket); - (void)WT_ATOMIC_SUB4(conn->open_file_count, 1); + (void)__wt_atomic_sub32(&conn->open_file_count, 1); __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/os_posix/os_path.c b/src/os_posix/os_path.c index 07b14b55b44..af28e1b3b56 100644 --- a/src/os_posix/os_path.c +++ b/src/os_posix/os_path.c @@ -12,10 +12,10 @@ * __wt_absolute_path -- * Return if a filename is an absolute path. */ -int +bool __wt_absolute_path(const char *path) { - return (path[0] == '/' ? 1 : 0); + return (path[0] == '/'); } /* diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c index 3fc692d8755..96bbba9bab2 100644 --- a/src/os_posix/os_remove.c +++ b/src/os_posix/os_remove.c @@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) * level should have closed it before removing. */ __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl) + TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq) if (strcmp(name, fh->name) == 0) break; __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c index e4f24cdb44e..c7222aac6c4 100644 --- a/src/os_posix/os_thread.c +++ b/src/os_posix/os_thread.c @@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session, WT_DECL_RET; /* Spawn a new thread of control. 
*/ - if ((ret = pthread_create(tidret, NULL, func, arg)) == 0) + WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_create"); } @@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid) { WT_DECL_RET; - if ((ret = pthread_join(tid, NULL)) == 0) + WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret); + if (ret == 0) return (0); WT_RET_MSG(session, ret, "pthread_join"); diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c index 097c73b5731..a9d3d521052 100644 --- a/src/os_win/os_errno.c +++ b/src/os_win/os_errno.c @@ -22,7 +22,7 @@ __wt_map_error_to_windows_error(int error) { Also validate we do not get any COM errors (which are negative integers) */ - WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset)); + WT_ASSERT(NULL, error < 0); return (error + -(windows_error_offset)); } @@ -96,7 +96,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen) snprintf(errbuf, errlen, "%s", buf) > 0) return (errbuf); if (lasterror != 0 && session != NULL && - __wt_buf_set(session, &session->err, buf, strlen(buf)) == 0) + __wt_buf_fmt(session, &session->err, "%s", buf) == 0) return (session->err.data); } diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c index 51f6d6533c8..14ca5d61282 100644 --- a/src/os_win/os_mtx_cond.c +++ b/src/os_win/os_mtx_cond.c @@ -37,13 +37,15 @@ __wt_cond_alloc(WT_SESSION_IMPL *session, } /* - * __wt_cond_wait -- - * Wait on a mutex, optionally timing out. + * __wt_cond_wait_signal -- + * Wait on a mutex, optionally timing out. If we get it + * before the time out period expires, let the caller know. */ int -__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) +__wt_cond_wait_signal( + WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled) { - DWORD milliseconds; + DWORD err, milliseconds; WT_DECL_RET; uint64_t milliseconds64; int locked; @@ -51,7 +53,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs) locked = 0; /* Fast path if already signalled. */ - if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0) + *signalled = 1; + if (__wt_atomic_addi32(&cond->waiters, 1) == 0) return (0); /* @@ -91,17 +94,25 @@ ret = SleepConditionVariableCS( &cond->cond, &cond->mtx, INFINITE); + /* + * SleepConditionVariableCS returns non-zero on success, 0 on timeout + * or failure. Check for timeout, else convert to a WiredTiger error + * value and fail.
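WT_SYSCALL_RETRY, now wrapped around pthread_create and pthread_join above, retries a bounded number of times on transient failures. A simplified sketch for pthread-style calls that return the error code directly (the retry count and errno list here are illustrative, and a real version would also back off between attempts):

	#include <errno.h>

	#define SYSCALL_RETRY(call, ret) do {				\
		int __i;						\
		for (__i = 0; __i < 10; ++__i) {			\
			ret = (call);					\
			if (ret != EAGAIN && ret != EBUSY &&		\
			    ret != EINTR)				\
				break;	/* success or hard failure */	\
		}							\
	} while (0)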
+ */ if (ret == 0) { - if (GetLastError() == ERROR_TIMEOUT) { - ret = 1; - } - } + if ((err = GetLastError()) == ERROR_TIMEOUT) + *signalled = 0; + else + ret = __wt_errno(); + } else + ret = 0; - (void)WT_ATOMIC_SUB4(cond->waiters, 1); + (void)__wt_atomic_subi32(&cond->waiters, 1); if (locked) LeaveCriticalSection(&cond->mtx); - if (ret != 0) + + if (ret == 0) return (0); WT_RET_MSG(session, ret, "SleepConditionVariableCS"); } @@ -130,7 +141,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond) if (cond->waiters == -1) return (0); - if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) { + if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) { EnterCriticalSection(&cond->mtx); locked = 1; WakeAllConditionVariable(&cond->cond); diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c index a77bef63b9d..3bd24369242 100644 --- a/src/os_win/os_open.c +++ b/src/os_win/os_open.c @@ -39,7 +39,7 @@ __wt_open(WT_SESSION_IMPL *session, /* Increment the reference count if we already have the file open. */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) + TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; @@ -160,7 +160,7 @@ setupfh: */ matched = 0; __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) + TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) if (strcmp(name, tfh->name) == 0) { ++tfh->ref; *fhp = tfh; @@ -169,7 +169,7 @@ setupfh: } if (!matched) { WT_CONN_FILE_INSERT(conn, fh, bucket); - (void)WT_ATOMIC_ADD4(conn->open_file_count, 1); + (void)__wt_atomic_add32(&conn->open_file_count, 1); *fhp = fh; } @@ -217,7 +217,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) /* Remove from the list. */ bucket = fh->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_FILE_REMOVE(conn, fh, bucket); - (void)WT_ATOMIC_SUB4(conn->open_file_count, 1); + (void)__wt_atomic_sub32(&conn->open_file_count, 1); __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/os_win/os_path.c b/src/os_win/os_path.c index 89f05e238c4..9d001e50571 100644 --- a/src/os_win/os_path.c +++ b/src/os_win/os_path.c @@ -12,7 +12,7 @@ * __wt_absolute_path -- * Return if a filename is an absolute path. */ -int +bool __wt_absolute_path(const char *path) { /* @@ -21,7 +21,7 @@ __wt_absolute_path(const char *path) */ if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':') path += 2; - return (path[0] == '/' || path[0] == '\\' ? 1 : 0); + return (path[0] == '/' || path[0] == '\\'); } /* diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c index 0c6396c775f..55b50030064 100644 --- a/src/os_win/os_remove.c +++ b/src/os_win/os_remove.c @@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name) * level should have closed it before removing. */ __wt_spin_lock(session, &conn->fh_lock); - SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl) + TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq) if (strcmp(name, fh->name) == 0) break; __wt_spin_unlock(session, &conn->fh_lock); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 37acb28a00b..10daa8b717c 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -27,18 +27,30 @@ typedef struct { WT_ITEM dsk; /* Temporary disk-image buffer */ - /* Track whether all changes to the page are written. */ + /* + * Track start/stop write generation to decide if all changes to the + * page are written. 
+ */ + uint32_t orig_write_gen; + + /* + * Track start/stop checkpoint generations to decide if lookaside table + * records are correct. + */ + uint64_t orig_btree_checkpoint_gen; + uint64_t orig_txn_checkpoint_gen; + + /* + * Track maximum transaction ID seen and first unwritten transaction ID. + */ uint64_t max_txn; uint64_t first_dirty_txn; - uint32_t orig_write_gen; /* - * If page updates are skipped because they are as yet unresolved, or - * the page has updates we cannot discard, the page is left "dirty": - * the page cannot be discarded and a subsequent reconciliation will - * be necessary to discard the page. + * When we can't mark the page clean (for example, checkpoint found some + * uncommitted updates), there's a leave-dirty flag. */ - int leave_dirty; + int leave_dirty; /* * Raw compression (don't get me started, as if normal reconciliation @@ -153,18 +165,12 @@ typedef struct { void *dsk; /* Split's disk image */ /* - * When busy pages get large, we need to be able to evict them - * even when they contain unresolved updates, or updates which - * cannot be evicted because of running transactions. In such - * cases, break the page into multiple blocks, write the blocks - * that can be evicted, saving lists of updates for blocks that - * cannot be evicted, then re-instantiate the blocks that cannot - * be evicted as new, in-memory pages, restoring the updates on - * those pages. + * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and + * WT_EVICT_LOOKASIDE configurations. */ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * The key for a row-store page; no column-store key is needed @@ -220,12 +226,14 @@ typedef struct { size_t space_avail; /* Remaining space in this chunk */ /* - * While reviewing updates for each page, we store skipped updates here, - * and then move them to per-block areas as the blocks are defined. + * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and + * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each + * page, we save WT_UPDATE lists here, and then move them to per-block + * areas as the blocks are defined. 
*/ - WT_UPD_SKIPPED *skip; /* Skipped updates */ - uint32_t skip_next; - size_t skip_allocated; + WT_SAVE_UPD *supd; /* Saved updates */ + uint32_t supd_next; + size_t supd_allocated; /* * We don't need to keep the 0th key around on internal pages, the @@ -277,7 +285,10 @@ typedef struct { WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */ - int tested_ref_state; /* Debugging information */ + int cache_write_lookaside; /* Used the lookaside table */ + int cache_write_restore; /* Used update/restoration */ + + uint32_t tested_ref_state; /* Debugging information */ } WT_RECONCILE; static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int); @@ -318,8 +329,11 @@ static int __rec_split_row_promote( WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t); static int __rec_split_write(WT_SESSION_IMPL *, WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int); +static int __rec_update_las( + WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *); static int __rec_write_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); +static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); static int __rec_write_wrapup_err( WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *); @@ -338,31 +352,19 @@ int __wt_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags) { - WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_RECONCILE *r; - int page_lock, scan_lock, split_lock; - conn = S2C(session); page = ref->page; mod = page->modify; - page_lock = scan_lock = split_lock = 0; - - /* We're shouldn't get called with a clean page, that's an error. */ - if (!__wt_page_is_modified(page)) - WT_RET_MSG(session, WT_ERROR, - "Attempt to reconcile a clean page."); WT_RET(__wt_verbose(session, WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type))); - WT_STAT_FAST_CONN_INCR(session, rec_pages); - WT_STAT_FAST_DATA_INCR(session, rec_pages); - if (LF_ISSET(WT_EVICTING)) { - WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); - WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); - } + + /* We shouldn't get called with a clean page, that's an error. */ + WT_ASSERT(session, __wt_page_is_modified(page)); #ifdef HAVE_DIAGNOSTIC { @@ -386,39 +388,15 @@ __wt_reconcile(WT_SESSION_IMPL *session, r = session->reconcile; /* - * The compaction process looks at the page's modification information; - * if compaction is running, acquire the page's lock. - */ - if (conn->compact_in_memory_pass) { - WT_PAGE_LOCK(session, page); - page_lock = 1; - } - - /* - * Reconciliation reads the lists of updates, so obsolete updates cannot - * be discarded while reconciliation is in progress. - */ - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret); - if (ret == 0) - break; - __wt_yield(); - } - scan_lock = 1; - - /* - * Mark internal pages as splitting to ensure we don't deadlock when - * performing an in-memory split during a checkpoint. + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. 
*/ - if (WT_PAGE_IS_INTERNAL(page)) { - for (;;) { - F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret); - if (ret == 0) - break; - __wt_yield(); - } - split_lock = 1; - } + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); /* Reconcile the page. */ switch (page->type) { @@ -445,19 +423,34 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ILLEGAL_VALUE_SET(session); } + /* Get the final status for the reconciliation. */ + if (ret == 0) + ret = __rec_write_status(session, r, page); + /* Wrap up the page reconciliation. */ if (ret == 0) ret = __rec_write_wrapup(session, r, page); else WT_TRET(__rec_write_wrapup_err(session, r, page)); - /* Release the locks we're holding. */ - if (split_lock) - F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED); - if (scan_lock) - F_CLR_ATOMIC(page, WT_PAGE_SCANNING); - if (page_lock) - WT_PAGE_UNLOCK(session, page); + /* Release the reconciliation lock. */ + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + + /* Update statistics. */ + WT_STAT_FAST_CONN_INCR(session, rec_pages); + WT_STAT_FAST_DATA_INCR(session, rec_pages); + if (LF_ISSET(WT_EVICTING)) { + WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction); + WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction); + } + if (r->cache_write_lookaside) { + WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside); + WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside); + } + if (r->cache_write_restore) { + WT_STAT_FAST_CONN_INCR(session, cache_write_restore); + WT_STAT_FAST_DATA_INCR(session, cache_write_restore); + } /* * Clean up the boundary structures: some workloads result in millions @@ -489,6 +482,125 @@ __wt_reconcile(WT_SESSION_IMPL *session, } /* + * __rec_las_checkpoint_test -- + * Return if the lookaside table is going to collide with a checkpoint. + */ +static inline bool +__rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_CONNECTION_IMPL *conn; + WT_BTREE *btree; + + conn = S2C(session); + btree = S2BT(session); + + /* + * Running checkpoints can collide with the lookaside table because + * reconciliation using the lookaside table writes the key's last + * committed value, which might not be the value checkpoint would write. + * If reconciliation was configured for lookaside table eviction, this + * file participates in checkpoints, and any of the tree or system + * transactional generation numbers don't match, there's a possible + * collision. + * + * It's a complicated test, but the alternative is to have checkpoint + * drain lookaside table reconciliations, and this isn't a problem for + * most workloads. + */ + if (!F_ISSET(r, WT_EVICT_LOOKASIDE)) + return (false); + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + return (false); + if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen && + r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen && + r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen) + return (false); + return (true); +} + +/* + * __rec_write_status -- + * Return the final status for reconciliation. + */ +static int +__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) +{ + WT_BTREE *btree; + WT_PAGE_MODIFY *mod; + + btree = S2BT(session); + mod = page->modify; + + /* Check for a lookaside table and checkpoint collision. */ + if (__rec_las_checkpoint_test(session, r)) + return (EBUSY); + + /* + * Set the page's status based on whether or not we cleaned the page. + */ + if (r->leave_dirty) { + /* + * Update the page's first unwritten transaction ID. + */ + mod->first_dirty_txn = r->first_dirty_txn; + + /* + * The page remains dirty. 
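The F_CAS_ATOMIC_WAIT call above collapses the old page/scan/split locks into a single WT_PAGE_RECONCILIATION flag bit acquired with an atomic compare-and-swap. A sketch of that style of flag lock (hypothetical names, C11 atomics):

	#include <sched.h>
	#include <stdatomic.h>
	#include <stdint.h>

	#define PAGE_RECONCILIATION 0x01u

	static void
	flag_lock(_Atomic uint32_t *flags, uint32_t bit)
	{
		uint32_t old;

		for (;;) {
			/* Expect the bit clear; CAS fails if it is set. */
			old = atomic_load(flags) & ~bit;
			if (atomic_compare_exchange_weak(
			    flags, &old, old | bit))
				return;
			sched_yield();	/* held or raced, retry */
		}
	}

	static void
	flag_unlock(_Atomic uint32_t *flags, uint32_t bit)
	{
		atomic_fetch_and(flags, ~bit);
	}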
+ * + * Any checkpoint call cleared the tree's modified flag before + * writing pages, so we must explicitly reset it. We insert a + * barrier after the change for clarity (the requirement is the + * flag be set before a subsequent checkpoint reads it, and + * as the current checkpoint is waiting on this reconciliation + * to complete, there's no risk of that happening) + */ + btree->modified = 1; + WT_FULL_BARRIER(); + + /* + * Eviction should only be here if following the save/restore + * eviction path. + */ + WT_ASSERT(session, + !F_ISSET(r, WT_EVICTING) || + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); + } else { + /* + * Track the page's maximum transaction ID (used to decide if + * we're likely to be able to evict this page in the future). + */ + mod->rec_max_txn = r->max_txn; + + /* + * Track the tree's maximum transaction ID (used to decide if + * it's safe to discard the tree). Reconciliation for eviction + * is multi-threaded, only update the tree's maximum transaction + * ID when doing a checkpoint. That's sufficient, we only care + * about the maximum transaction ID of current updates in the + * tree, and checkpoint visits every dirty page in the tree. + */ + if (!F_ISSET(r, WT_EVICTING) && + WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) + btree->rec_max_txn = r->max_txn; + + /* + * The page only might be clean; if the write generation is + * unchanged since reconciliation started, it's clean. + * + * If the write generation changed, the page has been written + * since reconciliation started and remains dirty (that can't + * happen when evicting, the page is exclusively locked). + */ + if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0)) + __wt_cache_dirty_decr(session, page); + else + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + } + + return (0); +} + +/* * __rec_root_write -- * Handle the write of a root page. */ @@ -577,7 +689,7 @@ err: __wt_page_out(session, &next); * __rec_raw_compression_config -- * Configure raw compression. */ -static inline int +static inline bool __rec_raw_compression_config( WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage) { @@ -588,11 +700,11 @@ __rec_raw_compression_config( /* Check if raw compression configured. */ if (btree->compressor == NULL || btree->compressor->compress_raw == NULL) - return (0); + return (false); /* Only for row-store and variable-length column-store objects. */ if (page->type == WT_PAGE_COL_FIX) - return (0); + return (false); /* * Raw compression cannot support dictionary compression. (Technically, @@ -602,11 +714,11 @@ __rec_raw_compression_config( * that seems an unlikely use case.) */ if (btree->dictionary != 0) - return (0); + return (false); /* Raw compression cannot support prefix compression. */ if (btree->prefix_compression != 0) - return (0); + return (false); /* * Raw compression is also turned off during salvage: we can't allow @@ -614,9 +726,9 @@ __rec_raw_compression_config( * can't manipulate the page size. 
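The compare-and-swap on the write generation at the end of __rec_write_status above is the whole of the "might be clean" test. A self-contained sketch of the same idea, assuming a stand-in structure rather than the real WT_PAGE_MODIFY:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_modify {
	_Atomic uint32_t write_gen;	/* Bumped on every page update */
};

/*
 * Mark the page clean only if nothing raced in: the CAS succeeds only
 * when the generation still matches the value sampled when
 * reconciliation started, and resets it to zero as the clean state.
 */
static bool
page_mark_clean(struct page_modify *mod, uint32_t orig_write_gen)
{
	uint32_t expected = orig_write_gen;

	return (atomic_compare_exchange_strong(
	    &mod->write_gen, &expected, 0));
}

If a concurrent update bumped the generation, the CAS fails and the page stays dirty, which is why the failure branch above can assert the page isn't being evicted: eviction holds the page exclusively, so no update can race in.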
*/ if (salvage != NULL) - return (0); + return (false); - return (1); + return (true); } /* @@ -628,10 +740,12 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep) { WT_BTREE *btree; + WT_CONNECTION_IMPL *conn; WT_PAGE *page; WT_RECONCILE *r; btree = S2BT(session); + conn = S2C(session); page = ref->page; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) { @@ -648,9 +762,59 @@ __rec_write_init(WT_SESSION_IMPL *session, F_SET(&r->dsk, WT_ITEM_ALIGNED); } + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + WT_ASSERT(session, r->ref == NULL); + /* Remember the configuration. */ r->ref = ref; r->page = page; + + /* + * Save the page's write generation before reading the page. + * Save the transaction generations before reading the page. + * These are all ordered reads, but we only need one. + */ + r->orig_btree_checkpoint_gen = btree->checkpoint_gen; + r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen; + WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); + + /* + * Lookaside table eviction is configured when eviction gets aggressive, + * adjust the flags for cases we don't support. + */ + if (LF_ISSET(WT_EVICT_LOOKASIDE)) { + /* + * Saving lookaside table updates into the lookaside table won't + * work. + */ + if (F_ISSET(btree, WT_BTREE_LOOKASIDE)) + LF_CLR(WT_EVICT_LOOKASIDE); + + /* + * We don't yet support fixed-length column-store combined with + * the lookaside table. It's not hard to do, but the underlying + * function that reviews which updates can be written to the + * evicted page and which updates need to be written to the + * lookaside table needs access to the original value from the + * page being evicted, and there's no code path for that in the + * case of fixed-length column-store objects. (Row-store and + * variable-width column-store objects provide a reference to + * the unpacked on-page cell for this purpose, but there isn't + * an on-page cell for fixed-length column-store objects.) For + * now, turn it off. + */ + if (page->type == WT_PAGE_COL_FIX) + LF_CLR(WT_EVICT_LOOKASIDE); + + /* + * Check for a lookaside table and checkpoint collision, and if + * we find one, turn off the lookaside file (we've gone to all + * the effort of getting exclusive access to the page, might as + * well try and evict it). + */ + if (__rec_las_checkpoint_test(session, r)) + LF_CLR(WT_EVICT_LOOKASIDE); + } r->flags = flags; /* Track if the page can be marked clean. */ @@ -668,8 +832,8 @@ __rec_write_init(WT_SESSION_IMPL *session, r->all_empty_value = 1; r->any_empty_value = 0; - /* The list of cached, skipped updates. */ - r->skip_next = 0; + /* The list of saved updates. */ + r->supd_next = 0; /* * Dictionary compression only writes repeated values once. We grow @@ -714,14 +878,11 @@ __rec_write_init(WT_SESSION_IMPL *session, r->salvage = salvage; - /* Save the page's write generation before reading the page. */ - WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); - /* * Running transactions may update the page after we write it, so * this is the highest ID we can be confident we will see. 
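The lookaside/checkpoint collision test consulted in __rec_write_init above (and re-checked by __rec_write_status) reduces to comparing the generations sampled at the start of reconciliation against the current ones. A condensed, illustrative predicate; the real __rec_las_checkpoint_test also checks the WT_EVICT_LOOKASIDE and WT_BTREE_NO_CHECKPOINT state:

#include <stdbool.h>
#include <stdint.h>

/*
 * No collision is possible when neither generation has moved since
 * reconciliation sampled them and the two agree with each other,
 * i.e. no checkpoint started or finished in the meantime.
 */
static bool
las_checkpoint_collision(uint64_t orig_btree_gen, uint64_t orig_txn_gen,
    uint64_t cur_btree_gen, uint64_t cur_txn_gen)
{
	if (orig_btree_gen == cur_btree_gen &&
	    orig_txn_gen == cur_txn_gen &&
	    orig_btree_gen == orig_txn_gen)
		return (false);
	return (true);
}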
*/ - r->first_dirty_txn = S2C(session)->txn_global.last_running; + r->first_dirty_txn = conn->txn_global.last_running; return (0); } @@ -748,7 +909,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __rec_bnd_cleanup(session, r, 1); - __wt_free(session, r->skip); + __wt_free(session, r->supd); __wt_buf_free(session, &r->k.buf); __wt_buf_free(session, &r->v.buf); @@ -784,6 +945,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) if (r->bnd == NULL) return; + /* Reconciliation is not re-entrant, make sure that doesn't happen. */ + r->ref = NULL; + /* * Free the boundary structures' memory. In the case of normal cleanup, * discard any memory we won't reuse in the next reconciliation; in the @@ -799,7 +963,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); __wt_buf_free(session, &bnd->key); } __wt_free(session, r->bnd); @@ -820,66 +984,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy) for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) { __wt_free(session, bnd->addr.addr); __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); + __wt_free(session, bnd->supd); } } } /* - * __rec_skip_update_save -- - * Save a skipped WT_UPDATE list for later restoration. + * __rec_block_free -- + * Helper function to free a block. */ static int -__rec_skip_update_save( - WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip) +__rec_block_free( + WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) +{ + WT_BM *bm; + WT_BTREE *btree; + + btree = S2BT(session); + bm = btree->bm; + + return (bm->free(bm, session, addr, addr_size)); +} + +/* + * __rec_update_save -- + * Save a WT_UPDATE list for later restoration. + */ +static int +__rec_update_save(WT_SESSION_IMPL *session, + WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid) { WT_RET(__wt_realloc_def( - session, &r->skip_allocated, r->skip_next + 1, &r->skip)); - r->skip[r->skip_next].ins = ins; - r->skip[r->skip_next].rip = rip; - ++r->skip_next; + session, &r->supd_allocated, r->supd_next + 1, &r->supd)); + r->supd[r->supd_next].ins = ins; + r->supd[r->supd_next].rip = rip; + r->supd[r->supd_next].onpage_txn = txnid; + ++r->supd_next; return (0); } /* - * __rec_skip_update_move -- - * Move a skipped WT_UPDATE list from the per-page cache to a specific + * __rec_update_move -- + * Move a saved WT_UPDATE list from the per-page cache to a specific * block's list. */ static int -__rec_skip_update_move( - WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip) +__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd) { WT_RET(__wt_realloc_def( - session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip)); - bnd->skip[bnd->skip_next] = *skip; - ++bnd->skip_next; + session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd)); + bnd->supd[bnd->supd_next] = *supd; + ++bnd->supd_next; - skip->ins = NULL; - skip->rip = NULL; + supd->ins = NULL; + supd->rip = NULL; return (0); } /* * __rec_txn_read -- - * Return the first visible update in a list (or NULL if none are visible), - * set a flag if any updates were skipped, track the maximum transaction ID on - * the page. + * Return the update in a list that should be written (or NULL if none can + * be written). 
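Both __rec_update_save and __rec_update_move above lean on the same grow-on-demand array idiom (__wt_realloc_def). A self-contained approximation with plain realloc; the types are hypothetical stand-ins for WT_SAVE_UPD and the reconciliation structure:

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

struct save_upd {
	void	*ins;		/* Insert-list entry, if any */
	void	*rip;		/* On-page row reference, if any */
	uint64_t onpage_txn;	/* Txn ID of the value written to the page */
};

struct saved {
	struct save_upd *supd;
	uint32_t supd_next;	/* Entries in use */
	size_t	 supd_alloc;	/* Bytes allocated */
};

static int
update_save(struct saved *r, void *ins, void *rip, uint64_t txnid)
{
	size_t bytes, need;
	void *p;

	need = ((size_t)r->supd_next + 1) * sizeof(struct save_upd);
	if (need > r->supd_alloc) {
		/* Double the allocation to amortize realloc cost. */
		bytes = r->supd_alloc == 0 ?
		    4 * sizeof(struct save_upd) : r->supd_alloc * 2;
		if (bytes < need)
			bytes = need;
		if ((p = realloc(r->supd, bytes)) == NULL)
			return (ENOMEM);
		r->supd = p;
		r->supd_alloc = bytes;
	}
	r->supd[r->supd_next].ins = ins;
	r->supd[r->supd_next].rip = rip;
	r->supd[r->supd_next].onpage_txn = txnid;
	++r->supd_next;
	return (0);
}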
*/ -static inline int +static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) { + WT_BTREE *btree; WT_DECL_RET; - WT_ITEM ovfl; + WT_DECL_ITEM(tmp); WT_PAGE *page; - WT_UPDATE *upd, *upd_list, *upd_ovfl; + WT_UPDATE *append, *upd, *upd_list; size_t notused; uint64_t max_txn, min_txn, txnid; - int skipped; + int append_origv, skipped; *updp = NULL; + btree = S2BT(session); page = r->page; /* @@ -893,13 +1075,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } else upd_list = ins->upd; - skipped = 0; - for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list; - upd != NULL; upd = upd->next) { + for (skipped = 0, + max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, + upd = upd_list; upd != NULL; upd = upd->next) { if ((txnid = upd->txnid) == WT_TXN_ABORTED) continue; - /* Track the largest/smallest transaction IDs on the list. */ + /* + * Track the largest/smallest transaction IDs on the list and + * the smallest not-globally-visible transaction on the page. + */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; if (WT_TXNID_LT(txnid, min_txn)) @@ -909,132 +1094,231 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, r->first_dirty_txn = txnid; /* - * Record whether any updates were skipped on the way to finding - * the first visible update. - * - * If updates were skipped before the one being written, future - * reads without intervening modifications to the page could - * see a different value; if no updates were skipped, the page - * can safely be marked clean and does not need to be - * reconciled until modified again. + * Find the first update we can use. */ - if (*updp == NULL) { - if (__wt_txn_visible(session, txnid)) - *updp = upd; - else + if (F_ISSET(r, WT_EVICTING)) { + /* + * Eviction can write any committed update. + * + * When reconciling for eviction, track whether any + * uncommitted updates are found. + */ + if (__wt_txn_committed(session, txnid)) { + if (*updp == NULL) + *updp = upd; + } else skipped = 1; + } else { + /* + * Checkpoint can only write updates visible as of its + * snapshot. + * + * When reconciling for a checkpoint, track whether any + * updates were skipped on the way to finding the first + * visible update. + */ + if (*updp == NULL) { + if (__wt_txn_visible(session, txnid)) + *updp = upd; + else + skipped = 1; + } } } /* + * If all of the updates were aborted, quit. This test is not strictly + * necessary because the above loop exits with skipped not set and the + * maximum transaction left at its initial value of WT_TXN_NONE, so + * the test below will be true and we will return, but it's cheap and a + * little more explicit, and makes Coverity happy. + */ + if (max_txn == WT_TXN_NONE) + return (0); + + /* * Track the maximum transaction ID in the page. We store this in the - * page at the end of reconciliation if no updates are skipped, it's - * used to avoid evicting clean pages from memory with changes required - * to satisfy a snapshot read. + * tree at the end of reconciliation in the service of checkpoints, it + * is used to avoid discarding trees from memory when they have changes + * required to satisfy a snapshot read. */ if (WT_TXNID_LT(r->max_txn, max_txn)) r->max_txn = max_txn; /* - * If no updates were skipped and all updates are globally visible, the - * page can be marked clean and we're done, regardless of whether we're - * evicting or checkpointing.
+ * If there are no skipped updates and all updates are globally visible, + * the page can be marked clean and we're done, regardless of whether + * we're evicting or checkpointing. * * We have to check both: the oldest transaction ID may have moved while - * we were scanning the update list, so it is possible to skip an update - * but then find that by the end of the scan, all updates are stable. + * we were scanning the update list, so it is possible to find a skipped + * update, but then find all updates are stable at the end of the scan. + * + * Skip the visibility check for the lookaside table as a special case: + * we know there are no older readers of that table. */ - if (!skipped && __wt_txn_visible_all(session, max_txn)) + if (!skipped && + (F_ISSET(btree, WT_BTREE_LOOKASIDE) || + __wt_txn_visible_all(session, max_txn))) return (0); /* - * If some updates are not globally visible, or were skipped, the page - * cannot be marked clean. + * In some cases, there had better not be skipped updates or updates not + * yet globally visible. */ - r->leave_dirty = 1; - - /* If we're not evicting, we're done, we know what we'll write. */ - if (!F_ISSET(r, WT_EVICTING)) - return (0); - - /* In some cases, there had better not be any updates we can't write. */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) + if (F_ISSET(r, WT_VISIBILITY_ERR)) WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); + "reconciliation error, uncommitted update or update not " + "globally visible"); /* - * If evicting and we aren't able to save/restore the not-yet-visible - * updates, the page can't be evicted. + * If not trying to evict the page, we know what we'll write and we're + * done. Because some updates were skipped or are not globally visible, + * the page can't be marked clean. */ - if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE)) - return (EBUSY); + if (!F_ISSET(r, WT_EVICTING)) { + r->leave_dirty = 1; + return (0); + } /* - * Evicting a page with not-yet-visible updates: save and restore the - * list of updates on a newly instantiated page. - * - * The order of the updates on the list matters so we can't move only - * the unresolved updates, we have to move the entire update list. + * Evicting with either uncommitted changes or not-yet-globally-visible + * changes. There are two ways to continue, the save/restore eviction + * path or the lookaside table eviction path. The two cannot both be + * configured because the paths track different information. The + * save/restore path + * can handle both uncommitted and not-yet-globally-visible changes, by + * evicting most of the page and then creating a new, smaller page into + * which we re-instantiate those changes. The lookaside table path can + * only handle not-yet-globally-visible changes by writing those changes + * into the lookaside table and restoring them on demand if and when the + * page is read back into memory. * - * Clear the returned update so our caller ignores the key/value pair - * in the case of an insert/append entry (everything we need is in the - * update list), and otherwise writes the original on-page key/value - * pair to which the update list applies. + * Both paths are configured outside of reconciliation: the save/restore + * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is + * the WT_EVICT_LOOKASIDE flag.
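The two EBUSY tests that follow encode this comment's rules. Pulled out as a standalone decision function for clarity, with local stand-ins for the WT_EVICT_* flags:

#include <errno.h>
#include <stdbool.h>

#define EVICT_LOOKASIDE		0x1u	/* Stand-in for WT_EVICT_LOOKASIDE */
#define EVICT_UPDATE_RESTORE	0x2u	/* Stand-in for WT_EVICT_UPDATE_RESTORE */

static int
evict_path_check(unsigned int flags, bool skipped_uncommitted)
{
	/* Neither path configured: the page can't be evicted now. */
	if (!(flags & (EVICT_LOOKASIDE | EVICT_UPDATE_RESTORE)))
		return (EBUSY);

	/*
	 * Uncommitted changes can only be handled by save/restore: the
	 * lookaside path writes committed, not-yet-globally-visible
	 * updates only.
	 */
	if (skipped_uncommitted && !(flags & EVICT_UPDATE_RESTORE))
		return (EBUSY);

	return (0);
}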
*/ - *updp = NULL; + if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + return (EBUSY); + + append_origv = 0; + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) { + /* + * The save/restore eviction path. + * + * Clear the returned update so our caller ignores the key/value + * pair in the case of an insert/append list entry (everything + * we need is in the update list), and otherwise writes the + * original on-page key/value pair to which the update list + * applies. + */ + *updp = NULL; + + /* The page can't be marked clean. */ + r->leave_dirty = 1; + + /* + * A special-case for overflow values, where we can't write the + * original on-page value item to disk because it's been updated + * or removed. + * + * What happens is that an overflow value is updated or removed + * and its backing blocks freed. If any reader in the system + * might still want the value, a copy was cached in the page + * reconciliation tracking memory, and the page cell set to + * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and + * we're splitting it up in order to push parts of it out of + * memory. + * + * We could write the original on-page value item to disk... if + * we had a copy. The cache may not have a copy (a globally + * visible update would have kept a value from being cached), or + * an update that subsequently became globally visible could + * cause a cached value to be discarded. Either way, once there + * is a globally visible update, we may not have the original + * value. + * + * Fortunately, if there's a globally visible update we don't + * care about the original version, so we simply ignore it, no + * transaction can ever try and read it. If there isn't a + * globally visible update, there had better be a cached value. + * + * In the latter case, we could write the value out to disk, but + * (1) we are planning on re-instantiating this page in memory, + * it isn't going to disk, and (2) the value item is eventually + * going to be discarded, that seems like a waste of a write. + * Instead, find the cached value and append it to the update + * list we're saving for later restoration. + */ + if (vpack != NULL && + vpack->raw == WT_CELL_VALUE_OVFL_RM && + !__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } else { + /* + * The lookaside table eviction path. + * + * If at least one update is globally visible, copy the update + * list and ignore the current on-page value. If no update is + * globally visible, readers require the page's original value. + */ + if (!__wt_txn_visible_all(session, min_txn)) + append_origv = 1; + } /* - * Handle the case were we don't want to write an original on-page value - * item to disk because it's been updated or removed. - * - * Here's the deal: an overflow value was updated or removed and its - * backing blocks freed. If any transaction in the system might still - * read the value, a copy was cached in page reconciliation tracking - * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction - * then chose the page and we're splitting it up in order to push parts - * of it out of memory. - * - * We could write the original on-page value item to disk... if we had - * a copy. The cache may not have a copy (a globally visible update - * would have kept a value from ever being cached), or an update that - * subsequent became globally visible could cause a cached value to be - * discarded. 
Either way, once there's a globally visible update, we - * may not have the value. - * - * Fortunately, if there's a globally visible update we don't care about - * the original version, so we simply ignore it, no transaction can ever - * try and read it. If there isn't a globally visible update, there had - * better be a cached value. - * - * In the latter case, we could write the value out to disk, but (1) we - * are planning on re-instantiating this page in memory, it isn't going - * to disk, and (2) the value item is eventually going to be discarded, - * that seems like a waste of a write. Instead, find the cached value - * and append it to the update list we're saving for later restoration. - */ - if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM && - !__wt_txn_visible_all(session, min_txn)) { - if ((ret = __wt_ovfl_txnc_search( - page, vpack->data, vpack->size, &ovfl)) != 0) - WT_PANIC_RET(session, ret, - "cached overflow item discarded early"); + * We need the original on-page value for some reason: get a copy and + * append it to the end of the update list with a transaction ID that + * guarantees its visibility. + */ + if (append_origv) { + /* + * If we don't have a value cell, it's an insert/append list + * key/value pair which simply doesn't exist for some reader; + * place a deleted record at the end of the update list. + */ + if (vpack == NULL || vpack->type == WT_CELL_DEL) + WT_RET(__wt_update_alloc( + session, NULL, &append, &notused)); + else { + WT_RET(__wt_scr_alloc(session, 0, &tmp)); + if ((ret = __wt_page_cell_data_ref( + session, page, vpack, tmp)) == 0) + ret = __wt_update_alloc( + session, tmp, &append, &notused); + __wt_scr_free(session, &tmp); + WT_RET(ret); + } /* - * Create an update structure with an impossibly low transaction - * ID and append it to the update list we're about to save. - * Restoring that update list when this page is re-instantiated - * creates an update for the key/value pair visible to every - * running transaction in the system, ensuring the on-page value - * will be ignored. + * Give the entry an impossibly low transaction ID to ensure its + * global visibility, and append it to the update list. + * + * Note the change to the actual reader-accessible update list: + * from now on, the original on-page value appears at the end + * of the update list, even if this reconciliation subsequently + * fails. */ - WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused)); - upd_ovfl->txnid = WT_TXN_NONE; + append->txnid = WT_TXN_NONE; for (upd = upd_list; upd->next != NULL; upd = upd->next) ; - upd->next = upd_ovfl; + upd->next = append; } - return (__rec_skip_update_save(session, r, ins, rip)); + /* + * The order of the updates on the list matters: we can't move only the + * unresolved updates, we have to move the entire update list. + * + * If we skipped updates, the transaction value is never used. If we + * didn't skip updates, the list of updates is eventually written to + * the lookaside table, and associated with each update record is the + * transaction ID of the update we wrote in the reconciled page; once + * that transaction ID is globally visible, we know we no longer need + * the lookaside table records, allowing them to be discarded. + */ + return (__rec_update_save(session, + r, ins, rip, (*updp == NULL) ? WT_TXN_NONE : (*updp)->txnid)); } /* @@ -1104,8 +1388,8 @@ __rec_child_modify(WT_SESSION_IMPL *session, * to see if the delete is visible to us. Lock down the * structure.
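"Lock down the structure" here means swinging the WT_REF state word with a compare-and-swap, as the code just below does with __wt_atomic_casv32. A minimal model of that state-as-lock idiom; the enum values are illustrative, not WiredTiger's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { REF_DISK, REF_DELETED, REF_LOCKED, REF_MEM };

/* Only one thread can move DELETED -> LOCKED. */
static bool
ref_lock_deleted(_Atomic uint32_t *state)
{
	uint32_t expected = REF_DELETED;

	return (atomic_compare_exchange_strong(
	    state, &expected, (uint32_t)REF_LOCKED));
}

/* Publishing DELETED again releases the lock. */
static void
ref_unlock_deleted(_Atomic uint32_t *state)
{
	atomic_store(state, (uint32_t)REF_DELETED);
}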
*/ - if (!WT_ATOMIC_CAS4( - ref->state, WT_REF_DELETED, WT_REF_LOCKED)) + if (!__wt_atomic_casv32( + &ref->state, WT_REF_DELETED, WT_REF_LOCKED)) break; ret = __rec_child_deleted(session, r, ref, statep); WT_PUBLISH(ref->state, WT_REF_DELETED); @@ -1155,10 +1439,10 @@ __rec_child_modify(WT_SESSION_IMPL *session, * If called during checkpoint, acquire a hazard pointer * so the child isn't evicted, it's an in-memory case. * - * This call cannot return split/restart, dirty page - * eviction is shutout during checkpoint, all splits in - * process will have completed before we walk any pages - * for checkpoint. + * This call cannot return split/restart, eviction of + * pages that split into their parent is shut out during + * checkpoint, all splits in process will have completed + * before we walk any pages for checkpoint. */ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1215,7 +1499,7 @@ in_memory: * reason to write the cell. */ mod = ref->page->modify; - if (mod != NULL && mod->flags != 0) + if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK)) *statep = WT_CHILD_MODIFIED; else if (ref->addr == NULL) { *statep = WT_CHILD_IGNORE; @@ -1234,37 +1518,32 @@ static int __rec_child_deleted( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep) { - WT_BM *bm; WT_PAGE_DELETED *page_del; size_t addr_size; const uint8_t *addr; - bm = S2BT(session)->bm; page_del = ref->page_del; /* * Internal pages with child leaf pages in the WT_REF_DELETED state are * a special case during reconciliation. First, if the deletion was a * result of a session truncate call, the deletion may not be visible to - * us. In that case, we proceed as with any change that's not visible - * during reconciliation by setting the skipped flag and ignoring the - * change for the purposes of writing the internal page. + * us. In that case, we proceed as with any change not visible during + * reconciliation by ignoring the change for the purposes of writing the + * internal page. * * In this case, there must be an associated page-deleted structure, and * it holds the transaction ID we care about. + * + * In some cases, there had better not be any updates we can't see. */ - if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) { - /* - * In some cases, there had better not be any updates we can't - * write. - */ - if (F_ISSET(r, WT_SKIP_UPDATE_ERR)) - WT_PANIC_RET(session, EINVAL, - "reconciliation illegally skipped an update"); - } + if (F_ISSET(r, WT_VISIBILITY_ERR) && + page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) + WT_PANIC_RET(session, EINVAL, + "reconciliation illegally skipped an update"); /* - * The deletion is visible to us, deal with any underlying disk blocks. + * Deal with any underlying disk blocks. * * First, check to see if there is an address associated with this leaf: * if there isn't, we're done, the underlying page is already gone.
If @@ -1291,7 +1570,7 @@ __rec_child_deleted( (page_del == NULL || __wt_txn_visible_all(session, page_del->txnid))) { WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free(session, ((WT_ADDR *)ref->addr)->addr); @@ -1562,7 +1841,7 @@ static void __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) { bnd->offset = 0; - bnd->recno = 0; + bnd->recno = WT_RECNO_OOB; bnd->entries = 0; __wt_free(session, bnd->addr.addr); @@ -1571,9 +1850,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd) bnd->cksum = 0; __wt_free(session, bnd->dsk); - __wt_free(session, bnd->skip); - bnd->skip_next = 0; - bnd->skip_allocated = 0; + __wt_free(session, bnd->supd); + bnd->supd_next = 0; + bnd->supd_allocated = 0; /* * Don't touch the key, we re-use that memory in each new @@ -1775,9 +2054,13 @@ __rec_split_init(WT_SESSION_IMPL *session, * __rec_is_checkpoint -- * Return if we're writing a checkpoint. */ -static int -__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) +static bool +__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd) { + WT_BTREE *btree; + + btree = S2BT(session); + /* * Check to see if we're going to create a checkpoint. * @@ -1792,13 +2075,14 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd) * we don't do checkpoint writes here; clear the boundary information as * a reminder and create the checkpoint during wrapup. */ - if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) && + bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) { bnd->addr.addr = NULL; bnd->addr.size = 0; bnd->addr.type = 0; - return (1); + return (true); } - return (0); + return (false); } /* @@ -1841,7 +2125,7 @@ __rec_split_row_promote( WT_DECL_ITEM(update); WT_DECL_RET; WT_ITEM *max; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t cnt, len, size; uint32_t i; const uint8_t *pa, *pb; @@ -1892,36 +2176,37 @@ __rec_split_row_promote( * the last key and smaller than the current key. */ max = r->last; - for (i = r->skip_next; i > 0; --i) { - skip = &r->skip[i - 1]; - if (skip->ins == NULL) - WT_ERR(__wt_row_leaf_key( - session, r->page, skip->rip, update, 0)); - else { - update->data = WT_INSERT_KEY(skip->ins); - update->size = WT_INSERT_KEY_SIZE(skip->ins); - } + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) + for (i = r->supd_next; i > 0; --i) { + supd = &r->supd[i - 1]; + if (supd->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, r->page, supd->rip, update, 0)); + else { + update->data = WT_INSERT_KEY(supd->ins); + update->size = WT_INSERT_KEY_SIZE(supd->ins); + } - /* Compare against the current key, it must be less. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->cur, &cmp)); - if (cmp >= 0) - continue; + /* Compare against the current key, it must be less. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->cur, &cmp)); + if (cmp >= 0) + continue; - /* Compare against the last key, it must be greater. */ - WT_ERR(__wt_compare( - session, btree->collator, update, r->last, &cmp)); - if (cmp >= 0) - max = update; + /* Compare against the last key, it must be greater. */ + WT_ERR(__wt_compare( + session, btree->collator, update, r->last, &cmp)); + if (cmp >= 0) + max = update; - /* - * The skipped updates are in key-sort order so the entry we're - * looking for is either the last one or the next-to-last one - * in the list. 
Once we've compared an entry against the last - * key on the page, we're done. - */ - break; - } + /* + * The saved updates are in key-sort order so the entry + * we're looking for is either the last or the next-to- + * last one in the list. Once we've compared an entry + * against the last key on the page, we're done. + */ + break; + } /* * The largest key on the last block must sort before the current key, @@ -2228,7 +2513,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, * We track the record number at each column-store split point, set an * initial value. */ - recno = 0; + recno = WT_RECNO_OOB; if (dsk->type == WT_PAGE_COL_VAR) recno = last->recno; @@ -2326,10 +2611,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RET(compressor->pre_size(compressor, wt_session, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, (size_t)r->raw_offsets[slots], &result_len)); - extra_skip = 0; - if (btree->kencryptor != NULL) - extra_skip = btree->kencryptor->size_const + - WT_ENCRYPT_LEN_SIZE; + extra_skip = btree->kencryptor == NULL ? 0 : + btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE; corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP; WT_RET(bm->write_size(bm, session, &corrected_page_size)); @@ -2477,7 +2760,7 @@ no_slots: break; case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: - next->recno = 0; + next->recno = WT_RECNO_OOB; if (!last_block) { /* * Confirm there was uncompressed data remaining @@ -2530,7 +2813,8 @@ no_slots: * * If it's not a checkpoint, write the block. */ - if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) { + if (r->bnd_next == 1 && + last_block && __rec_is_checkpoint(session, r, last)) { if (write_ref == dst) WT_RET(__wt_buf_set( session, &r->dsk, dst->mem, dst->size)); @@ -2647,13 +2931,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) } /* - * We only arrive here with no entries to write if the page was entirely - * empty, and if the page is empty, we merge it into its parent during - * the parent's reconciliation. A page with skipped updates isn't truly - * empty, continue on. + * We may arrive here with no entries to write if the page was entirely + * empty or if nothing on the page was visible to us. */ - if (r->entries == 0 && r->skip_next == 0) - return (0); + if (r->entries == 0) { + /* + * Pages with skipped or not-yet-globally visible updates aren't + * really empty; otherwise, the page is truly empty and we will + * merge it into its parent during the parent's reconciliation. + */ + if (r->supd_next == 0) + return (0); + + /* + * If using the save/restore eviction path, continue with the + * write, the page will be restored after we finish. + * + * If using the lookaside table eviction path, we can't continue + * (we need a page to be written, otherwise we won't ever find + * the updates for future reads). + */ + if (F_ISSET(r, WT_EVICT_LOOKASIDE)) + return (EBUSY); + } /* Set the boundary reference and increment the count. */ bnd = &r->bnd[r->bnd_next++]; @@ -2666,9 +2966,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r) dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk); /* If this is a checkpoint, we're done, otherwise write the page. */ - return ( - __rec_is_checkpoint(r, bnd) ? 0 : - __rec_split_write(session, r, bnd, &r->dsk, 1)); + return (__rec_is_checkpoint(session, r, bnd) ? 
+ 0 : __rec_split_write(session, r, bnd, &r->dsk, 1)); } /* @@ -2794,7 +3093,7 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; - WT_UPD_SKIPPED *skip; + WT_SAVE_UPD *supd; size_t addr_size; uint32_t bnd_slot, i, j; int cmp; @@ -2837,23 +3136,23 @@ __rec_split_write(WT_SESSION_IMPL *session, bnd->cksum = 0; /* - * Check if we've skipped updates that belong to this block, and move - * any to the per-block structure. Quit as soon as we find a skipped + * Check if we've saved updates that belong to this block, and move + * any to the per-block structure. Quit as soon as we find a saved * update that doesn't belong to the block, they're in sorted order. * * This code requires a key be filled in for the next block (or the * last block flag be set, if there's no next block). */ - for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) { - /* The last block gets all remaining skipped updates. */ + for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) { + /* The last block gets all remaining saved updates. */ if (last_block) { - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); continue; } /* - * Get the skipped update's key and compare it with this block's - * key range. If the skipped update list belongs with the block + * Get the saved update's key and compare it with this block's + * key range. If the saved update list belongs with the block * we're about to write, move it to the per-block memory. Check * only to the first update that doesn't go with the block, they * must be in sorted order. @@ -2861,43 +3160,56 @@ __rec_split_write(WT_SESSION_IMPL *session, switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: - if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno) - goto skip_check_complete; + if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno) + goto supd_check_complete; break; case WT_PAGE_ROW_LEAF: - if (skip->ins == NULL) + if (supd->ins == NULL) WT_ERR(__wt_row_leaf_key( - session, page, skip->rip, key, 0)); + session, page, supd->rip, key, 0)); else { - key->data = WT_INSERT_KEY(skip->ins); - key->size = WT_INSERT_KEY_SIZE(skip->ins); + key->data = WT_INSERT_KEY(supd->ins); + key->size = WT_INSERT_KEY_SIZE(supd->ins); } WT_ERR(__wt_compare(session, btree->collator, key, &(bnd + 1)->key, &cmp)); if (cmp >= 0) - goto skip_check_complete; + goto supd_check_complete; break; WT_ILLEGAL_VALUE_ERR(session); } - WT_ERR(__rec_skip_update_move(session, bnd, skip)); + WT_ERR(__rec_update_move(session, bnd, supd)); } -skip_check_complete: +supd_check_complete: /* * If there are updates that weren't moved to the block, shuffle them to - * the beginning of the cached list (we maintain the skipped updates in - * sorted order, new skipped updates must be appended to the list). + * the beginning of the cached list (we maintain the saved updates in + * sorted order, new saved updates must be appended to the list). + */ + for (j = 0; i < r->supd_next; ++j, ++i) + r->supd[j] = r->supd[i]; + r->supd_next = j; + + /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, note that in the + * page header. 
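The j/i shuffle a few lines above, which slides the not-yet-written saved updates to the front of the array after a block takes its share, is easy to misread in diff form. The same compaction in isolation, with an illustrative element type:

#include <stdint.h>

struct save_upd { uint64_t key; };	/* Stand-in for WT_SAVE_UPD */

/*
 * Entries [0, moved) went with the block just written; slide the rest
 * to the front, preserving their sorted order, and return the new count.
 */
static uint32_t
supd_compact(struct save_upd *supd, uint32_t supd_next, uint32_t moved)
{
	uint32_t i, j;

	for (j = 0, i = moved; i < supd_next; ++j, ++i)
		supd[j] = supd[i];
	return (j);
}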
*/ - for (j = 0; i < r->skip_next; ++j, ++i) - r->skip[j] = r->skip[i]; - r->skip_next = j; + if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) { + F_SET(dsk, WT_PAGE_LAS_UPDATE); + r->cache_write_lookaside = 1; + } /* - * If we had to skip updates in order to build this disk image, we can't - * actually write it. Instead, we will re-instantiate the page using the - * disk image and the list of updates we skipped. + * If using the save/restore eviction path and we had to skip updates in + * order to build this disk image, we can't actually write it. Instead, + * we will re-instantiate the page using the disk image and the list of + * updates we skipped. */ - if (bnd->skip != NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + r->cache_write_restore = 1; + /* * If the buffer is compressed (raw compression was configured), * we have to decompress it so we can instantiate it later. It's @@ -2963,12 +3275,148 @@ skip_check_complete: WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr)); bnd->addr.size = (uint8_t)addr_size; + /* + * If using the lookaside table eviction path and we found updates that + * weren't globally visible when reconciling this page, copy them into + * the database's lookaside store. + */ + if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) + ret = __rec_update_las(session, r, btree->id, bnd); + done: err: __wt_scr_free(session, &key); return (ret); } /* + * __rec_update_las -- + * Copy a set of updates into the database's lookaside buffer. + */ +static int +__rec_update_las(WT_SESSION_IMPL *session, + WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd) +{ + WT_CURSOR *cursor; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_ITEM las_addr, las_value; + WT_PAGE *page; + WT_SAVE_UPD *list; + WT_UPDATE *upd; + uint64_t las_counter; + uint32_t i, session_flags, slot; + uint8_t *p; + + cursor = NULL; + WT_CLEAR(las_addr); + WT_CLEAR(las_value); + page = r->page; + + /* + * We're writing lookaside records: start instantiating them on pages + * we read (with the right flag set), and start sweeping the file. + */ + __wt_las_set_written(session); + + WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); + + /* Ensure enough room for a column-store key without checking. */ + WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + /* + * Each key in the lookaside table is associated with a block, and those + * blocks are freed and reallocated to other pages as pages in the tree + * are modified and reconciled. We want to be sure we don't add records + * to the lookaside table, then discard the block to which they apply, + * then write a new block to the same address, and then apply the old + * records to the new block when it's read. We don't want to clean old + * records out of the lookaside table every time we free a block because + * that happens a lot and would be costly; instead, we clean out the old + * records when adding new records into the lookaside table. This works + * because we only read from the lookaside table for pages marked with + * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a + * block with no lookaside records, so the lookaside table won't be + * checked when the block is read, even if there are lookaside table + * records matching that block. If we rewrite a block that has lookaside + * records, we'll run this code, discarding any old records that might + * exist. 
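The cursor operations below assemble a composite lookaside key from the pieces this comment names. Purely as an illustration of those components (the real table uses WiredTiger's packed key format, not a C struct):

#include <stddef.h>
#include <stdint.h>

struct las_key {
	uint32_t btree_id;	/* Which tree the block belongs to */
	const void *block_addr;	/* Address of the block just written */
	size_t	 block_addr_len;
	uint64_t counter;	/* Preserves update-list insertion order */
	uint64_t onpage_txn;	/* Txn ID of the value left on the page */
	const void *key;	/* The row or column-store key itself */
	size_t	 key_len;
};

Keying by block address is what makes the remove-then-insert sequence described above safe: stale records can only be confused with a block at the same address, and those are removed before new records go in.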
+ */ + WT_ERR(__wt_las_remove_block( + session, cursor, btree_id, bnd->addr.addr, bnd->addr.size)); + + /* Lookaside table key component: block address. */ + las_addr.data = bnd->addr.addr; + las_addr.size = bnd->addr.size; + + /* Enter each update in the boundary's list into the lookaside store. */ + for (las_counter = 0, i = 0, + list = bnd->supd; i < bnd->supd_next; ++i, ++list) { + /* Lookaside table key component: source key. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_ERR( + __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); + key->size = WT_PTRDIFF(p, key->data); + + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) + WT_ERR(__wt_row_leaf_key( + session, page, list->rip, key, 0)); + else { + key->data = WT_INSERT_KEY(list->ins); + key->size = WT_INSERT_KEY_SIZE(list->ins); + } + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* Lookaside table value component: update reference. */ + switch (page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + upd = list->ins->upd; + break; + case WT_PAGE_ROW_LEAF: + if (list->ins == NULL) { + slot = WT_ROW_SLOT(page, list->rip); + upd = page->pg_row_upd[slot]; + } else + upd = list->ins->upd; + break; + WT_ILLEGAL_VALUE_ERR(session); + } + + /* + * Walk the list of updates, storing each key/value pair into + * the lookaside table. + */ + do { + cursor->set_key(cursor, btree_id, + &las_addr, ++las_counter, list->onpage_txn, key); + + if (WT_UPDATE_DELETED_ISSET(upd)) + las_value.size = 0; + else { + las_value.data = WT_UPDATE_DATA(upd); + las_value.size = upd->size; + } + cursor->set_value( + cursor, upd->txnid, upd->size, &las_value); + + WT_ERR(cursor->insert(cursor)); + } while ((upd = upd->next) != NULL); + } + +err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); + + __wt_scr_free(session, &key); + return (ret); +} + +/* * __wt_bulk_init -- * Bulk insert initialization. */ @@ -3008,7 +3456,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) recno = 1; break; case BTREE_ROW: - recno = 0; + recno = WT_RECNO_OOB; break; WT_ILLEGAL_VALUE(session); } @@ -3049,6 +3497,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) WT_RET(__rec_split_finish(session, r)); WT_RET(__rec_write_wrapup(session, r, r->page)); + WT_RET(__rec_write_status(session, r, r->page)); /* Mark the page's parent and the tree dirty. */ parent = r->ref->home; @@ -3824,7 +4273,7 @@ record_loop: /* * Write a placeholder. */ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); data = "@"; size = 1; @@ -4207,7 +4656,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) vtype = state == WT_CHILD_PROXY ? WT_CELL_ADDR_DEL : (u_int)vpack->raw; } - __rec_cell_build_addr(r, p, size, vtype, 0); + __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB); CHILD_RELEASE_ERR(session, hazard, ref); /* @@ -4294,7 +4743,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) addr = &multi->addr; __rec_cell_build_addr( - r, addr->addr, addr->size, __rec_vtype(addr), 0); + r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ if (key->len + val->len > r->space_avail) @@ -4450,7 +4899,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session, * Assert the case. 
*/ WT_ASSERT(session, - F_ISSET(r, WT_SKIP_UPDATE_RESTORE)); + F_ISSET(r, WT_EVICT_UPDATE_RESTORE)); /* * If the key is also a removed overflow item, @@ -4777,13 +5226,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) static int __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) { - WT_BM *bm; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_MULTI *multi; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -4799,17 +5246,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_free(session, multi->key.ikey); break; } - if (multi->skip == NULL) { + if (multi->supd == NULL) { if (multi->addr.reuse) multi->addr.addr = NULL; else { - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, multi->addr.addr, multi->addr.size)); __wt_free(session, multi->addr.addr); } } else { - __wt_free(session, multi->skip); - __wt_free(session, multi->skip_dsk); + __wt_free(session, multi->supd); + __wt_free(session, multi->supd_dsk); } } __wt_free(session, mod->mod_multi); @@ -4882,7 +5329,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) */ WT_RET(__wt_ref_info( session, ref, &addr, &addr_size, NULL)); - WT_RET(bm->free(bm, session, addr, addr_size)); + WT_RET(__rec_block_free(session, addr, addr_size)); if (__wt_off_page(ref->home, ref->addr)) { __wt_free( session, ((WT_ADDR *)ref->addr)->addr); @@ -4908,7 +5355,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * are checkpoints, and must be explicitly dropped. */ if (!__wt_ref_is_root(ref)) - WT_RET(bm->free(bm, session, + WT_RET(__rec_block_free(session, mod->mod_replace.addr, mod->mod_replace.size)); /* Discard the replacement page's address. */ @@ -4962,14 +5409,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * nothing to write. Allocate, then initialize the array of * replacement blocks. */ - if (bnd->skip != NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { WT_RET(__wt_calloc_def( session, r->bnd_next, &mod->mod_multi)); multi = mod->mod_multi; - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; bnd->dsk = NULL; mod->mod_multi_entries = 1; @@ -5068,50 +5515,6 @@ err: __wt_scr_free(session, &tkey); F_SET(mod, WT_PM_REC_MULTIBLOCK); break; } - - /* - * If updates were skipped, the tree isn't clean. The checkpoint call - * cleared the tree's modified value before calling the eviction thread, - * so we must explicitly reset the tree's modified flag. We insert a - * barrier after the change for clarity (the requirement is the value - * be set before a subsequent checkpoint reads it, and because the - * current checkpoint is waiting on this reconciliation to complete, - * there's no risk of that happening). - */ - if (r->leave_dirty) { - mod->first_dirty_txn = r->first_dirty_txn; - - btree->modified = 1; - WT_FULL_BARRIER(); - } else { - /* - * If no updates were skipped, we have a new maximum transaction - * written for the page (used to decide if a clean page can be - * evicted). Set the highest transaction ID for the page. - * - * Track the highest transaction ID for the tree (used to decide - * if it's safe to discard all of the pages in the tree without - * further checking). 
Reconciliation in the service of eviction - * is multi-threaded, only update the tree's maximum transaction - * ID when doing a checkpoint. That's sufficient, we only care - * about the highest transaction ID of any update currently in - * the tree, and checkpoint visits every dirty page in the tree. - */ - mod->rec_max_txn = r->max_txn; - if (!F_ISSET(r, WT_EVICTING) && - WT_TXNID_LT(btree->rec_max_txn, r->max_txn)) - btree->rec_max_txn = r->max_txn; - - /* - * The page only might be clean; if the write generation is - * unchanged since reconciliation started, it's clean. If the - * write generation changed, the page has been written since - * we started reconciliation and remains dirty. - */ - if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0)) - __wt_cache_dirty_decr(session, page); - } - return (0); } @@ -5122,14 +5525,12 @@ err: __wt_scr_free(session, &tkey); static int __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) { - WT_BM *bm; WT_BOUNDARY *bnd; WT_DECL_RET; WT_MULTI *multi; WT_PAGE_MODIFY *mod; uint32_t i; - bm = S2BT(session)->bm; mod = page->modify; /* @@ -5160,7 +5561,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) if (bnd->addr.reuse) bnd->addr.addr = NULL; else { - WT_TRET(bm->free(bm, session, + WT_TRET(__rec_block_free(session, bnd->addr.addr, bnd->addr.size)); __wt_free(session, bnd->addr.addr); } @@ -5203,18 +5604,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_RET(__wt_row_ikey_alloc(session, 0, bnd->key.data, bnd->key.size, &multi->key.ikey)); - if (bnd->skip == NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; @@ -5243,18 +5644,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) { multi->key.recno = bnd->recno; - if (bnd->skip == NULL) { + if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) { + multi->supd = bnd->supd; + multi->supd_entries = bnd->supd_next; + bnd->supd = NULL; + multi->supd_dsk = bnd->dsk; + bnd->dsk = NULL; + } else { multi->addr = bnd->addr; multi->addr.reuse = 0; multi->size = bnd->size; multi->cksum = bnd->cksum; bnd->addr.addr = NULL; - } else { - multi->skip = bnd->skip; - multi->skip_entries = bnd->skip_next; - bnd->skip = NULL; - multi->skip_dsk = bnd->dsk; - bnd->dsk = NULL; } } mod->mod_multi_entries = r->bnd_next; diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c index a36fd696079..d091a5d94da 100644 --- a/src/schema/schema_list.c +++ b/src/schema/schema_list.c @@ -29,8 +29,8 @@ __schema_add_table(WT_SESSION_IMPL *session, WT_RET(ret); bucket = table->name_hash % WT_HASH_ARRAY_SIZE; - SLIST_INSERT_HEAD(&session->tables, table, l); - SLIST_INSERT_HEAD(&session->tablehash[bucket], table, hashl); + TAILQ_INSERT_HEAD(&session->tables, table, q); + TAILQ_INSERT_HEAD(&session->tablehash[bucket], table, hashq); *tablep = table; return (0); @@ -51,7 +51,7 @@ __schema_find_table(WT_SESSION_IMPL *session, bucket = __wt_hash_city64(name, namelen) % 
WT_HASH_ARRAY_SIZE; restart: - SLIST_FOREACH(table, &session->tablehash[bucket], hashl) { + TAILQ_FOREACH(table, &session->tablehash[bucket], hashq) { tablename = table->name; (void)WT_PREFIX_SKIP(tablename, "table:"); if (WT_STRING_MATCH(tablename, name, namelen)) { @@ -228,8 +228,8 @@ __wt_schema_remove_table(WT_SESSION_IMPL *session, WT_TABLE *table) WT_ASSERT(session, table->refcnt <= 1); bucket = table->name_hash % WT_HASH_ARRAY_SIZE; - SLIST_REMOVE(&session->tables, table, __wt_table, l); - SLIST_REMOVE(&session->tablehash[bucket], table, __wt_table, hashl); + TAILQ_REMOVE(&session->tables, table, q); + TAILQ_REMOVE(&session->tablehash[bucket], table, hashq); return (__wt_schema_destroy_table(session, &table)); } @@ -243,7 +243,7 @@ __wt_schema_close_tables(WT_SESSION_IMPL *session) WT_DECL_RET; WT_TABLE *table; - while ((table = SLIST_FIRST(&session->tables)) != NULL) + while ((table = TAILQ_FIRST(&session->tables)) != NULL) WT_TRET(__wt_schema_remove_table(session, table)); return (ret); } diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c index dea797f823d..e9439abe16f 100644 --- a/src/schema/schema_stat.c +++ b/src/schema/schema_stat.c @@ -90,7 +90,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, if (i == 0) *stats = *new; else - __wt_stat_aggregate_dsrc_stats(new, stats); + __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } @@ -102,7 +102,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session, WT_ERR(__wt_curstat_open( session, buf->data, cfg, &stat_cursor)); new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor); - __wt_stat_aggregate_dsrc_stats(new, stats); + __wt_stat_dsrc_aggregate_single(new, stats); WT_ERR(stat_cursor->close(stat_cursor)); } diff --git a/src/session/session_api.c b/src/session/session_api.c index ef9735a8b98..a1f5618a317 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -383,6 +383,22 @@ err: if (cursor != NULL) } /* + * __wt_session_create -- + * Internal version of WT_SESSION::create. + */ +int +__wt_session_create( + WT_SESSION_IMPL *session, const char *uri, const char *config) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_create(session, uri, config))); + return (ret); +} + +/* * __session_create -- * WT_SESSION->create method. */ @@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config) WT_ERR_NOTFOUND_OK(ret); } - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_create(session, uri, config))); + ret = __wt_session_create(session, uri, config); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -529,6 +543,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config) } /* + * __wt_session_drop -- + * Internal version of WT_SESSION::drop. + */ +int +__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) +{ + WT_DECL_RET; + + WT_WITH_SCHEMA_LOCK(session, + WT_WITH_TABLE_LOCK(session, + ret = __wt_schema_drop(session, uri, cfg))); + return (ret); +} + +/* * __session_drop -- * WT_SESSION->drop method. */ @@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config) /* Disallow objects in the WiredTiger name space. */ WT_ERR(__wt_str_name_check(session, uri)); - WT_WITH_SCHEMA_LOCK(session, - WT_WITH_TABLE_LOCK(session, - ret = __wt_schema_drop(session, uri, cfg))); + ret = __wt_session_drop(session, uri, cfg); err: /* Note: drop operations cannot be unrolled (yet?). 
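Both __wt_session_create and __wt_session_drop above share one shape: take the schema lock, then the table lock, run the schema operation, and release in reverse order. A standalone model of that nesting, with POSIX mutexes standing in for WiredTiger's locks and WT_WITH_* macros:

#include <pthread.h>

static pthread_mutex_t schema_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run op under both locks; its error, if any, is returned afterward. */
static int
with_schema_and_table_locks(int (*op)(void *), void *cookie)
{
	int ret;

	(void)pthread_mutex_lock(&schema_lock);
	(void)pthread_mutex_lock(&table_lock);
	ret = op(cookie);
	(void)pthread_mutex_unlock(&table_lock);
	(void)pthread_mutex_unlock(&schema_lock);
	return (ret);
}

Factoring the wrappers out lets internal callers take the same locks in the same order as the public API methods, which appears to be the point of adding them in this change.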
*/ API_END_RET_NOTFOUND_MAP(session, ret); @@ -800,7 +827,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config) WT_STAT_FAST_CONN_INCR(session, txn_commit); txn = &session->txn; - if (F_ISSET(txn, WT_TXN_ERROR)) { + if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) { __wt_errx(session, "failed transaction requires rollback"); ret = EINVAL; } @@ -915,7 +942,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) * If our LSN is smaller than the current sync LSN then our * transaction is stable. We're done. */ - if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0) + if (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) <= 0) goto err; /* @@ -937,7 +964,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config) * Keep checking the LSNs until we find it is stable or we reach * our timeout. */ - while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) { + while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) { WT_ERR(__wt_cond_signal(session, conn->log_file_cond)); WT_ERR(__wt_epoch(session, &now)); waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION; @@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) * operations, but checkpoint does enough I/O it may be called upon to * perform slow operations for the block manager. */ - F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); + F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); /* * Only one checkpoint can be active at a time, and checkpoints must run @@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config) WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0); -err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK); +err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); API_END_RET_NOTFOUND_MAP(session, ret); } @@ -1166,8 +1193,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, if (i == conn->session_size) WT_ERR_MSG(session, ENOMEM, "only configured to support %" PRIu32 " sessions" - " (including %" PRIu32 " internal)", - conn->session_size, WT_NUM_INTERNAL_SESSIONS); + " (including %d additional internal sessions)", + conn->session_size, WT_EXTRA_INTERNAL_SESSIONS); /* * If the active session count is increasing, update it. We don't worry @@ -1190,7 +1217,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, event_handler == NULL ? session->event_handler : event_handler); TAILQ_INIT(&session_ret->cursors); - SLIST_INIT(&session_ret->dhandles); + TAILQ_INIT(&session_ret->dhandles); /* * If we don't have one, allocate the dhandle hash array. * Allocate the table hash array as well. @@ -1202,8 +1229,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE, sizeof(struct __tables_hash), &session_ret->tablehash)); for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) { - SLIST_INIT(&session_ret->dhhash[i]); - SLIST_INIT(&session_ret->tablehash[i]); + TAILQ_INIT(&session_ret->dhhash[i]); + TAILQ_INIT(&session_ret->tablehash[i]); } /* Initialize transaction support: default to read-committed. 
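The SLIST-to-TAILQ conversions running through this change (here, and in session_dhandle.c below) matter because TAILQ_REMOVE is O(1) given only the element, where SLIST_REMOVE must walk the list. A self-contained sketch of the two-queue pattern; the names are stand-ins and, as in the code above, every head must be initialized before use:

#include <sys/queue.h>
#include <stdint.h>

#define HASH_ARRAY_SIZE 512	/* Stand-in for WT_HASH_ARRAY_SIZE */

struct handle {
	uint64_t name_hash;
	TAILQ_ENTRY(handle) q;		/* All handles, one queue */
	TAILQ_ENTRY(handle) hashq;	/* Hash-bucket chain */
};

TAILQ_HEAD(handle_list, handle);

struct session_cache {
	struct handle_list handles;
	struct handle_list hash[HASH_ARRAY_SIZE];
};

static void
cache_init(struct session_cache *sc)
{
	int i;

	TAILQ_INIT(&sc->handles);
	for (i = 0; i < HASH_ARRAY_SIZE; i++)
		TAILQ_INIT(&sc->hash[i]);
}

static void
cache_insert(struct session_cache *sc, struct handle *h)
{
	uint64_t bucket = h->name_hash % HASH_ARRAY_SIZE;

	TAILQ_INSERT_HEAD(&sc->handles, h, q);
	TAILQ_INSERT_HEAD(&sc->hash[bucket], h, hashq);
}

static void
cache_remove(struct session_cache *sc, struct handle *h)
{
	uint64_t bucket = h->name_hash % HASH_ARRAY_SIZE;

	TAILQ_REMOVE(&sc->handles, h, q);
	TAILQ_REMOVE(&sc->hash[bucket], h, hashq);
}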
*/ diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c index be8ca494778..dd0b50cc094 100644 --- a/src/session/session_dhandle.c +++ b/src/session/session_dhandle.c @@ -25,8 +25,8 @@ __session_add_dhandle( dhandle_cache->dhandle = session->dhandle; bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; - SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l); - SLIST_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashl); + TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q); + TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq); if (dhandle_cachep != NULL) *dhandle_cachep = dhandle_cache; @@ -36,6 +36,61 @@ __session_add_dhandle( } /* + * __session_discard_dhandle -- + * Remove a data handle from the session cache. + */ +static void +__session_discard_dhandle( + WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache) +{ + uint64_t bucket; + + bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; + TAILQ_REMOVE(&session->dhandles, dhandle_cache, q); + TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq); + + (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1); + + __wt_overwrite_and_free(session, dhandle_cache); +} + +/* + * __session_find_dhandle -- + * Search for a data handle in the session cache. + */ +static void +__session_find_dhandle(WT_SESSION_IMPL *session, + const char *uri, const char *checkpoint, + WT_DATA_HANDLE_CACHE **dhandle_cachep) +{ + WT_DATA_HANDLE *dhandle; + WT_DATA_HANDLE_CACHE *dhandle_cache; + uint64_t bucket; + + dhandle = NULL; + + bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; +retry: TAILQ_FOREACH(dhandle_cache, &session->dhhash[bucket], hashq) { + dhandle = dhandle_cache->dhandle; + if (WT_DHANDLE_INACTIVE(dhandle) && !WT_IS_METADATA(dhandle)) { + __session_discard_dhandle(session, dhandle_cache); + /* We deleted our entry, retry from the start. */ + goto retry; + } + + if (strcmp(uri, dhandle->name) != 0) + continue; + if (checkpoint == NULL && dhandle->checkpoint == NULL) + break; + if (checkpoint != NULL && dhandle->checkpoint != NULL && + strcmp(checkpoint, dhandle->checkpoint) == 0) + break; + } + + *dhandle_cachep = dhandle_cache; +} + +/* * __wt_session_lock_dhandle -- * Return when the current data handle is either (a) open with the * requested lock mode; or (b) closed and write locked. If exclusive @@ -173,6 +228,7 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; + WT_DATA_HANDLE_CACHE *dhandle_cache; WT_DECL_RET; int locked, write_locked; @@ -185,6 +241,13 @@ __wt_session_release_btree(WT_SESSION_IMPL *session) * If we had special flags set, close the handle so that future access * can get a handle without special flags. */ + if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) { + __session_find_dhandle(session, + dhandle->name, dhandle->checkpoint, &dhandle_cache); + if (dhandle_cache != NULL) + __session_discard_dhandle(session, dhandle_cache); + } + if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) { ret = __wt_conn_btree_sync_and_close(session, 0, 1); F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE); @@ -272,26 +335,6 @@ retry: WT_RET(__wt_meta_checkpoint_last_name( } /* - * __session_discard_btree -- - * Discard our reference to the btree. 
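Editor's note: the new __session_discard_dhandle above makes the invariant explicit: a cached handle is linked on two queues at once, the session-wide list and a hash bucket, and must be unlinked from both before it is freed. A self-contained sketch of that double membership using the same <sys/queue.h> TAILQ macros (hypothetical entry type and hash function):

	#include <sys/queue.h>
	#include <stdlib.h>

	#define	NBUCKETS	16	/* illustrative bucket count */

	struct entry {
		const char *name;
		TAILQ_ENTRY(entry) q;		/* full-list links */
		TAILQ_ENTRY(entry) hashq;	/* hash-bucket links */
	};
	TAILQ_HEAD(entry_list, entry);

	static struct entry_list all;
	static struct entry_list bucket[NBUCKETS];

	static unsigned
	hash(const char *s)
	{
		unsigned h;

		for (h = 0; *s != '\0'; ++s)
			h = h * 31 + (unsigned char)*s;
		return (h % NBUCKETS);
	}

	static void
	insert(struct entry *e)
	{
		TAILQ_INSERT_HEAD(&all, e, q);
		TAILQ_INSERT_HEAD(&bucket[hash(e->name)], e, hashq);
	}

	/*
	 * discard --
	 *	Unlink from both lists before freeing, as the hunk
	 *	above does for the dhandle cache.
	 */
	static void
	discard(struct entry *e)
	{
		TAILQ_REMOVE(&all, e, q);
		TAILQ_REMOVE(&bucket[hash(e->name)], e, hashq);
		free(e);
	}

	int
	main(void)
	{
		struct entry *e;
		unsigned i;

		TAILQ_INIT(&all);
		for (i = 0; i < NBUCKETS; ++i)
			TAILQ_INIT(&bucket[i]);

		if ((e = calloc(1, sizeof(*e))) == NULL)
			return (1);
		e->name = "table:example";
		insert(e);
		discard(e);
		return (0);
	}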
- */ -static void -__session_discard_btree( - WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache) -{ - uint64_t bucket; - - bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE; - SLIST_REMOVE( - &session->dhandles, dhandle_cache, __wt_data_handle_cache, l); - SLIST_REMOVE(&session->dhhash[bucket], - dhandle_cache, __wt_data_handle_cache, hashl); - - (void)WT_ATOMIC_SUB4(dhandle_cache->dhandle->session_ref, 1); - __wt_overwrite_and_free(session, dhandle_cache); -} - -/* * __wt_session_close_cache -- * Close any cached handles in a session. */ @@ -300,8 +343,8 @@ __wt_session_close_cache(WT_SESSION_IMPL *session) { WT_DATA_HANDLE_CACHE *dhandle_cache; - while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL) - __session_discard_btree(session, dhandle_cache); + while ((dhandle_cache = TAILQ_FIRST(&session->dhandles)) != NULL) + __session_discard_dhandle(session, dhandle_cache); } /* @@ -329,18 +372,18 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps); - dhandle_cache = SLIST_FIRST(&session->dhandles); + dhandle_cache = TAILQ_FIRST(&session->dhandles); while (dhandle_cache != NULL) { - dhandle_cache_next = SLIST_NEXT(dhandle_cache, l); + dhandle_cache_next = TAILQ_NEXT(dhandle_cache, q); dhandle = dhandle_cache->dhandle; if (dhandle != session->dhandle && dhandle->session_inuse == 0 && - (F_ISSET(dhandle, WT_DHANDLE_DEAD) || + (WT_DHANDLE_INACTIVE(dhandle) || (dhandle->timeofdeath != 0 && now - dhandle->timeofdeath > conn->sweep_idle_time))) { WT_STAT_FAST_CONN_INCR(session, dh_session_handles); WT_ASSERT(session, !WT_IS_METADATA(dhandle)); - __session_discard_btree(session, dhandle_cache); + __session_discard_dhandle(session, dhandle_cache); } dhandle_cache = dhandle_cache_next; } @@ -348,51 +391,37 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session) } /* - * __session_dhandle_find_shared -- + * __session_find_shared_dhandle -- * Search for a data handle in the connection and add it to a session's * cache. Since the data handle isn't locked, this must be called holding * the handle list lock, and we must increment the handle's reference * count before releasing it. */ static int -__session_dhandle_find_shared( +__session_find_shared_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint)); - (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1); + (void)__wt_atomic_add32(&session->dhandle->session_ref, 1); return (0); } + /* - * __session_dhandle_find -- + * __session_get_dhandle -- * Search for a data handle, first in the session cache, then in the * connection. */ static int -__session_dhandle_find( +__session_get_dhandle( WT_SESSION_IMPL *session, const char *uri, const char *checkpoint) { - WT_DATA_HANDLE *dhandle; WT_DATA_HANDLE_CACHE *dhandle_cache; WT_DECL_RET; - uint64_t bucket; - bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE; -retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) { - dhandle = dhandle_cache->dhandle; - if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - WT_ASSERT(session, !WT_IS_METADATA(dhandle)); - __session_discard_btree(session, dhandle_cache); - /* We deleted our entry, retry from the start. 
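Editor's note: the __session_dhandle_sweep loop above removes entries while iterating, so it captures TAILQ_NEXT before examining the current element. A standalone sketch of the idiom (hypothetical item type, not WiredTiger code):

	#include <sys/queue.h>
	#include <stdlib.h>

	struct item {
		int dead;
		TAILQ_ENTRY(item) q;
	};
	TAILQ_HEAD(item_list, item);

	/*
	 * sweep --
	 *	Remove expired items during a walk: save the successor
	 *	before touching the current item, since removing it
	 *	invalidates its queue links.
	 */
	static void
	sweep(struct item_list *list)
	{
		struct item *item, *next;

		for (item = TAILQ_FIRST(list); item != NULL; item = next) {
			next = TAILQ_NEXT(item, q);
			if (item->dead) {
				TAILQ_REMOVE(list, item, q);
				free(item);
			}
		}
	}

	int
	main(void)
	{
		struct item_list list;
		struct item *item;
		int i;

		TAILQ_INIT(&list);
		for (i = 0; i < 4; ++i) {
			if ((item = calloc(1, sizeof(*item))) == NULL)
				return (1);
			item->dead = i % 2;
			TAILQ_INSERT_HEAD(&list, item, q);
		}
		sweep(&list);
		while ((item = TAILQ_FIRST(&list)) != NULL) {
			TAILQ_REMOVE(&list, item, q);
			free(item);
		}
		return (0);
	}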
*/ - goto retry; - } - if (strcmp(uri, dhandle->name) != 0) - continue; - if ((checkpoint == NULL && dhandle->checkpoint == NULL) || - (checkpoint != NULL && dhandle->checkpoint != NULL && - strcmp(checkpoint, dhandle->checkpoint) == 0)) { - session->dhandle = dhandle; - return (0); - } + __session_find_dhandle(session, uri, checkpoint, &dhandle_cache); + if (dhandle_cache != NULL) { + session->dhandle = dhandle_cache->dhandle; + return (0); } /* @@ -400,7 +429,7 @@ retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) { * handle list and cache the handle we find. */ WT_WITH_HANDLE_LIST_LOCK(session, ret = - __session_dhandle_find_shared(session, uri, checkpoint)); + __session_find_shared_dhandle(session, uri, checkpoint)); if (ret == 0) ret = __session_add_dhandle(session, NULL); @@ -422,7 +451,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session, WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES)); for (;;) { - WT_RET(__session_dhandle_find(session, uri, checkpoint)); + WT_RET(__session_get_dhandle(session, uri, checkpoint)); dhandle = session->dhandle; /* diff --git a/src/support/pow.c b/src/support/pow.c index 8e42113a2ee..0f50bfe56a1 100644 --- a/src/support/pow.c +++ b/src/support/pow.c @@ -100,7 +100,7 @@ __wt_log2_int(uint32_t n) * __wt_ispo2 -- * Return if a number is a power-of-two. */ -int +bool __wt_ispo2(uint32_t v) { /* diff --git a/src/support/rand.c b/src/support/rand.c index caac04d3529..f5ecb12633e 100644 --- a/src/support/rand.c +++ b/src/support/rand.c @@ -84,8 +84,11 @@ __wt_random(WT_RAND_STATE volatile * rnd_state) * to initialize the state, or initializes with a seed that results in a * short period. */ - if (z == 0 || w == 0) - __wt_random_init(rnd_state); + if (z == 0 || w == 0) { + __wt_random_init(&rnd); + w = M_W(rnd); + z = M_Z(rnd); + } M_Z(rnd) = z = 36969 * (z & 65535) + (z >> 16); M_W(rnd) = w = 18000 * (w & 65535) + (w >> 16); diff --git a/src/support/stat.c b/src/support/stat.c index b0e7d660587..79248b0652c 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -2,672 +2,1016 @@ #include "wt_internal.h" +static const char * const __stats_dsrc_desc[] = { + "block-manager: file allocation unit size", + "block-manager: blocks allocated", + "block-manager: checkpoint size", + "block-manager: allocations requiring file extension", + "block-manager: blocks freed", + "block-manager: file magic number", + "block-manager: file major version number", + "block-manager: minor version number", + "block-manager: file bytes available for reuse", + "block-manager: file size in bytes", + "LSM: bloom filters in the LSM tree", + "LSM: bloom filter false positives", + "LSM: bloom filter hits", + "LSM: bloom filter misses", + "LSM: bloom filter pages evicted from cache", + "LSM: bloom filter pages read into cache", + "LSM: total size of bloom filters", + "btree: btree checkpoint generation", + "btree: column-store variable-size deleted values", + "btree: column-store fixed-size leaf pages", + "btree: column-store internal pages", + "btree: column-store variable-size RLE encoded values", + "btree: column-store variable-size leaf pages", + "btree: pages rewritten by compaction", + "btree: number of key/value pairs", + "btree: fixed-record size", + "btree: maximum tree depth", + "btree: maximum internal page key size", + "btree: maximum internal page size", + "btree: maximum leaf page key size", + "btree: maximum leaf page size", + "btree: maximum leaf page value size", + "btree: overflow pages", + "btree: row-store internal pages", + "btree: 
row-store leaf pages", + "cache: bytes read into cache", + "cache: bytes written from cache", + "cache: checkpoint blocked page eviction", + "cache: unmodified pages evicted", + "cache: page split during eviction deepened the tree", + "cache: modified pages evicted", + "cache: data source pages selected for eviction unable to be evicted", + "cache: hazard pointer blocked page eviction", + "cache: internal pages evicted", + "cache: pages split during eviction", + "cache: in-memory page splits", + "cache: in-memory page passed criteria to be split", + "cache: overflow values cached in memory", + "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", + "cache: overflow pages read into cache", + "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", + "compression: raw compression call failed, no additional data available", + "compression: raw compression call failed, additional data available", + "compression: raw compression call succeeded", + "compression: compressed pages read", + "compression: compressed pages written", + "compression: page written failed to compress", + "compression: page written was too small to compress", + "cursor: create calls", + "cursor: insert calls", + "cursor: bulk-loaded cursor-insert calls", + "cursor: cursor-insert key and value bytes inserted", + "cursor: next calls", + "cursor: prev calls", + "cursor: remove calls", + "cursor: cursor-remove key bytes removed", + "cursor: reset calls", + "cursor: restarted searches", + "cursor: search calls", + "cursor: search near calls", + "cursor: update calls", + "cursor: cursor-update value bytes updated", + "LSM: sleep for LSM checkpoint throttle", + "LSM: chunks in the LSM tree", + "LSM: highest merge generation in the LSM tree", + "LSM: queries that could have benefited from a Bloom filter that did not exist", + "LSM: sleep for LSM merge throttle", + "reconciliation: dictionary matches", + "reconciliation: internal page multi-block writes", + "reconciliation: leaf page multi-block writes", + "reconciliation: maximum blocks required for a page", + "reconciliation: internal-page overflow keys", + "reconciliation: leaf-page overflow keys", + "reconciliation: overflow values written", + "reconciliation: pages deleted", + "reconciliation: page checksum matches", + "reconciliation: page reconciliation calls", + "reconciliation: page reconciliation calls for eviction", + "reconciliation: leaf page key bytes discarded using prefix compression", + "reconciliation: internal page key bytes discarded using suffix compression", + "session: object compaction", + "session: open cursor count", + "transaction: update conflicts", +}; + +const char * +__wt_stat_dsrc_desc(int slot) +{ + return (__stats_dsrc_desc[slot]); +} + void -__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats) +__wt_stat_dsrc_init_single(WT_DSRC_STATS *stats) { - /* Clear, so can also be called for reinitialization. 
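Editor's note: the descriptions that used to hang off every statistic as a .desc field are now a single static string table indexed by the statistic's slot, with __wt_stat_dsrc_desc as the entire lookup; the statistics structure itself shrinks to bare integers. A minimal sketch of the layout (names illustrative, not the real table):

	#include <stdio.h>

	/* One description per statistics slot, in slot order. */
	static const char * const stats_desc[] = {
		"cursor: cursor create calls",
		"cursor: cursor insert calls",
	};

	static const char *
	stat_desc(int slot)
	{
		return (stats_desc[slot]);
	}

	int
	main(void)
	{
		printf("%s\n", stat_desc(1));
		return (0);
	}

Shrinking the struct matters here because, as the init functions below show, it is now replicated once per counter slot.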
*/ memset(stats, 0, sizeof(*stats)); +} + +void +__wt_stat_dsrc_init(WT_DATA_HANDLE *handle) +{ + int i; - stats->block_extension.desc = - "block-manager: allocations requiring file extension"; - stats->block_alloc.desc = "block-manager: blocks allocated"; - stats->block_free.desc = "block-manager: blocks freed"; - stats->block_checkpoint_size.desc = "block-manager: checkpoint size"; - stats->allocation_size.desc = - "block-manager: file allocation unit size"; - stats->block_reuse_bytes.desc = - "block-manager: file bytes available for reuse"; - stats->block_magic.desc = "block-manager: file magic number"; - stats->block_major.desc = "block-manager: file major version number"; - stats->block_size.desc = "block-manager: file size in bytes"; - stats->block_minor.desc = "block-manager: minor version number"; - stats->btree_checkpoint_generation.desc = - "btree: btree checkpoint generation"; - stats->btree_column_fix.desc = - "btree: column-store fixed-size leaf pages"; - stats->btree_column_internal.desc = - "btree: column-store internal pages"; - stats->btree_column_deleted.desc = - "btree: column-store variable-size deleted values"; - stats->btree_column_variable.desc = - "btree: column-store variable-size leaf pages"; - stats->btree_fixed_len.desc = "btree: fixed-record size"; - stats->btree_maxintlkey.desc = "btree: maximum internal page key size"; - stats->btree_maxintlpage.desc = "btree: maximum internal page size"; - stats->btree_maxleafkey.desc = "btree: maximum leaf page key size"; - stats->btree_maxleafpage.desc = "btree: maximum leaf page size"; - stats->btree_maxleafvalue.desc = "btree: maximum leaf page value size"; - stats->btree_maximum_depth.desc = "btree: maximum tree depth"; - stats->btree_entries.desc = "btree: number of key/value pairs"; - stats->btree_overflow.desc = "btree: overflow pages"; - stats->btree_compact_rewrite.desc = - "btree: pages rewritten by compaction"; - stats->btree_row_internal.desc = "btree: row-store internal pages"; - stats->btree_row_leaf.desc = "btree: row-store leaf pages"; - stats->cache_bytes_read.desc = "cache: bytes read into cache"; - stats->cache_bytes_write.desc = "cache: bytes written from cache"; - stats->cache_eviction_checkpoint.desc = - "cache: checkpoint blocked page eviction"; - stats->cache_eviction_fail.desc = - "cache: data source pages selected for eviction unable to be evicted"; - stats->cache_eviction_hazard.desc = - "cache: hazard pointer blocked page eviction"; - stats->cache_inmem_split.desc = "cache: in-memory page splits"; - stats->cache_eviction_internal.desc = "cache: internal pages evicted"; - stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; - stats->cache_read_overflow.desc = - "cache: overflow pages read into cache"; - stats->cache_overflow_value.desc = - "cache: overflow values cached in memory"; - stats->cache_eviction_deepen.desc = - "cache: page split during eviction deepened the tree"; - stats->cache_read.desc = "cache: pages read into cache"; - stats->cache_eviction_split.desc = - "cache: pages split during eviction"; - stats->cache_write.desc = "cache: pages written from cache"; - stats->cache_eviction_clean.desc = "cache: unmodified pages evicted"; - stats->compress_read.desc = "compression: compressed pages read"; - stats->compress_write.desc = "compression: compressed pages written"; - stats->compress_write_fail.desc = - "compression: page written failed to compress"; - stats->compress_write_too_small.desc = - "compression: page written was too small to compress"; - 
stats->compress_raw_fail_temporary.desc = - "compression: raw compression call failed, additional data available"; - stats->compress_raw_fail.desc = - "compression: raw compression call failed, no additional data available"; - stats->compress_raw_ok.desc = - "compression: raw compression call succeeded"; - stats->cursor_insert_bulk.desc = - "cursor: bulk-loaded cursor-insert calls"; - stats->cursor_create.desc = "cursor: create calls"; - stats->cursor_insert_bytes.desc = - "cursor: cursor-insert key and value bytes inserted"; - stats->cursor_remove_bytes.desc = - "cursor: cursor-remove key bytes removed"; - stats->cursor_update_bytes.desc = - "cursor: cursor-update value bytes updated"; - stats->cursor_insert.desc = "cursor: insert calls"; - stats->cursor_next.desc = "cursor: next calls"; - stats->cursor_prev.desc = "cursor: prev calls"; - stats->cursor_remove.desc = "cursor: remove calls"; - stats->cursor_reset.desc = "cursor: reset calls"; - stats->cursor_search.desc = "cursor: search calls"; - stats->cursor_search_near.desc = "cursor: search near calls"; - stats->cursor_update.desc = "cursor: update calls"; - stats->bloom_false_positive.desc = "LSM: bloom filter false positives"; - stats->bloom_hit.desc = "LSM: bloom filter hits"; - stats->bloom_miss.desc = "LSM: bloom filter misses"; - stats->bloom_page_evict.desc = - "LSM: bloom filter pages evicted from cache"; - stats->bloom_page_read.desc = - "LSM: bloom filter pages read into cache"; - stats->bloom_count.desc = "LSM: bloom filters in the LSM tree"; - stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree"; - stats->lsm_generation_max.desc = - "LSM: highest merge generation in the LSM tree"; - stats->lsm_lookup_no_bloom.desc = - "LSM: queries that could have benefited from a Bloom filter that did not exist"; - stats->lsm_checkpoint_throttle.desc = - "LSM: sleep for LSM checkpoint throttle"; - stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle"; - stats->bloom_size.desc = "LSM: total size of bloom filters"; - stats->rec_dictionary.desc = "reconciliation: dictionary matches"; - stats->rec_suffix_compression.desc = - "reconciliation: internal page key bytes discarded using suffix compression"; - stats->rec_multiblock_internal.desc = - "reconciliation: internal page multi-block writes"; - stats->rec_overflow_key_internal.desc = - "reconciliation: internal-page overflow keys"; - stats->rec_prefix_compression.desc = - "reconciliation: leaf page key bytes discarded using prefix compression"; - stats->rec_multiblock_leaf.desc = - "reconciliation: leaf page multi-block writes"; - stats->rec_overflow_key_leaf.desc = - "reconciliation: leaf-page overflow keys"; - stats->rec_multiblock_max.desc = - "reconciliation: maximum blocks required for a page"; - stats->rec_overflow_value.desc = - "reconciliation: overflow values written"; - stats->rec_page_match.desc = "reconciliation: page checksum matches"; - stats->rec_pages.desc = "reconciliation: page reconciliation calls"; - stats->rec_pages_eviction.desc = - "reconciliation: page reconciliation calls for eviction"; - stats->rec_page_delete.desc = "reconciliation: pages deleted"; - stats->session_compact.desc = "session: object compaction"; - stats->session_cursor_open.desc = "session: open cursor count"; - stats->txn_update_conflict.desc = "transaction: update conflicts"; + for (i = 0; i < WT_COUNTER_SLOTS; ++i) { + handle->stats[i] = &handle->stat_array[i]; + __wt_stat_dsrc_init_single(handle->stats[i]); + } } void -__wt_stat_refresh_dsrc_stats(void *stats_arg) 
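Editor's note: the new __wt_stat_dsrc_init above wires up WT_COUNTER_SLOTS copies of the statistics structure, one pointer per slot. The point of the layout is that updaters bump a single slot while readers aggregate across all of them, trading read-side work for uncontended increments. A toy version under illustrative names and slot count (the real WT_COUNTER_SLOTS value is not shown in this diff):

	#include <inttypes.h>
	#include <stdio.h>
	#include <string.h>

	#define	COUNTER_SLOTS	8	/* illustrative, not WT_COUNTER_SLOTS */

	struct stats {
		int64_t cursor_insert;
	};

	struct handle {
		struct stats stat_array[COUNTER_SLOTS];
		struct stats *stats[COUNTER_SLOTS];
	};

	static void
	stats_init(struct handle *h)
	{
		int i;

		for (i = 0; i < COUNTER_SLOTS; ++i) {
			h->stats[i] = &h->stat_array[i];
			memset(h->stats[i], 0, sizeof(*h->stats[i]));
		}
	}

	/* Writers bump a single slot (e.g., chosen per thread)... */
	static void
	incr_cursor_insert(struct handle *h, int slot)
	{
		++h->stats[slot]->cursor_insert;
	}

	/*
	 * ...and readers sum every slot, which is what the
	 * WT_STAT_READ-based aggregate functions further below do
	 * for each field.
	 */
	static int64_t
	read_cursor_insert(struct handle *h)
	{
		int64_t sum;
		int i;

		for (sum = 0, i = 0; i < COUNTER_SLOTS; ++i)
			sum += h->stats[i]->cursor_insert;
		return (sum);
	}

	int
	main(void)
	{
		struct handle h;

		stats_init(&h);
		incr_cursor_insert(&h, 0);
		incr_cursor_insert(&h, 3);
		printf("%" PRId64 "\n", read_cursor_insert(&h));
		return (0);
	}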
+__wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats) { - WT_DSRC_STATS *stats; + stats->block_extension = 0; + stats->block_alloc = 0; + stats->block_free = 0; + stats->block_checkpoint_size = 0; + stats->allocation_size = 0; + stats->block_reuse_bytes = 0; + stats->block_magic = 0; + stats->block_major = 0; + stats->block_size = 0; + stats->block_minor = 0; + /* not clearing btree_checkpoint_generation */ + stats->btree_column_fix = 0; + stats->btree_column_internal = 0; + stats->btree_column_deleted = 0; + stats->btree_column_variable = 0; + stats->btree_column_rle = 0; + stats->btree_fixed_len = 0; + stats->btree_maxintlkey = 0; + stats->btree_maxintlpage = 0; + stats->btree_maxleafkey = 0; + stats->btree_maxleafpage = 0; + stats->btree_maxleafvalue = 0; + stats->btree_maximum_depth = 0; + stats->btree_entries = 0; + stats->btree_overflow = 0; + stats->btree_compact_rewrite = 0; + stats->btree_row_internal = 0; + stats->btree_row_leaf = 0; + stats->cache_bytes_read = 0; + stats->cache_bytes_write = 0; + stats->cache_eviction_checkpoint = 0; + stats->cache_eviction_fail = 0; + stats->cache_eviction_hazard = 0; + stats->cache_inmem_splittable = 0; + stats->cache_inmem_split = 0; + stats->cache_eviction_internal = 0; + stats->cache_eviction_dirty = 0; + stats->cache_read_overflow = 0; + stats->cache_overflow_value = 0; + stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; + stats->cache_read = 0; + stats->cache_read_lookaside = 0; + stats->cache_eviction_split = 0; + stats->cache_write = 0; + stats->cache_write_restore = 0; + stats->cache_eviction_clean = 0; + stats->compress_read = 0; + stats->compress_write = 0; + stats->compress_write_fail = 0; + stats->compress_write_too_small = 0; + stats->compress_raw_fail_temporary = 0; + stats->compress_raw_fail = 0; + stats->compress_raw_ok = 0; + stats->cursor_insert_bulk = 0; + stats->cursor_create = 0; + stats->cursor_insert_bytes = 0; + stats->cursor_remove_bytes = 0; + stats->cursor_update_bytes = 0; + stats->cursor_insert = 0; + stats->cursor_next = 0; + stats->cursor_prev = 0; + stats->cursor_remove = 0; + stats->cursor_reset = 0; + stats->cursor_restart = 0; + stats->cursor_search = 0; + stats->cursor_search_near = 0; + stats->cursor_update = 0; + stats->bloom_false_positive = 0; + stats->bloom_hit = 0; + stats->bloom_miss = 0; + stats->bloom_page_evict = 0; + stats->bloom_page_read = 0; + stats->bloom_count = 0; + stats->lsm_chunk_count = 0; + stats->lsm_generation_max = 0; + stats->lsm_lookup_no_bloom = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + stats->bloom_size = 0; + stats->rec_dictionary = 0; + stats->rec_suffix_compression = 0; + stats->rec_multiblock_internal = 0; + stats->rec_overflow_key_internal = 0; + stats->rec_prefix_compression = 0; + stats->rec_multiblock_leaf = 0; + stats->rec_overflow_key_leaf = 0; + stats->rec_multiblock_max = 0; + stats->rec_overflow_value = 0; + stats->rec_page_match = 0; + stats->rec_pages = 0; + stats->rec_pages_eviction = 0; + stats->rec_page_delete = 0; + stats->session_compact = 0; + /* not clearing session_cursor_open */ + stats->txn_update_conflict = 0; +} + +void +__wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats) +{ + u_int i; - stats = (WT_DSRC_STATS *)stats_arg; - stats->block_extension.v = 0; - stats->block_alloc.v = 0; - stats->block_free.v = 0; - stats->block_checkpoint_size.v = 0; - stats->allocation_size.v = 0; - stats->block_reuse_bytes.v = 0; - stats->block_magic.v = 0; - stats->block_major.v = 0; - stats->block_size.v = 0; - 
stats->block_minor.v = 0; - stats->btree_column_fix.v = 0; - stats->btree_column_internal.v = 0; - stats->btree_column_deleted.v = 0; - stats->btree_column_variable.v = 0; - stats->btree_fixed_len.v = 0; - stats->btree_maxintlkey.v = 0; - stats->btree_maxintlpage.v = 0; - stats->btree_maxleafkey.v = 0; - stats->btree_maxleafpage.v = 0; - stats->btree_maxleafvalue.v = 0; - stats->btree_maximum_depth.v = 0; - stats->btree_entries.v = 0; - stats->btree_overflow.v = 0; - stats->btree_compact_rewrite.v = 0; - stats->btree_row_internal.v = 0; - stats->btree_row_leaf.v = 0; - stats->cache_bytes_read.v = 0; - stats->cache_bytes_write.v = 0; - stats->cache_eviction_checkpoint.v = 0; - stats->cache_eviction_fail.v = 0; - stats->cache_eviction_hazard.v = 0; - stats->cache_inmem_split.v = 0; - stats->cache_eviction_internal.v = 0; - stats->cache_eviction_dirty.v = 0; - stats->cache_read_overflow.v = 0; - stats->cache_overflow_value.v = 0; - stats->cache_eviction_deepen.v = 0; - stats->cache_read.v = 0; - stats->cache_eviction_split.v = 0; - stats->cache_write.v = 0; - stats->cache_eviction_clean.v = 0; - stats->compress_read.v = 0; - stats->compress_write.v = 0; - stats->compress_write_fail.v = 0; - stats->compress_write_too_small.v = 0; - stats->compress_raw_fail_temporary.v = 0; - stats->compress_raw_fail.v = 0; - stats->compress_raw_ok.v = 0; - stats->cursor_insert_bulk.v = 0; - stats->cursor_create.v = 0; - stats->cursor_insert_bytes.v = 0; - stats->cursor_remove_bytes.v = 0; - stats->cursor_update_bytes.v = 0; - stats->cursor_insert.v = 0; - stats->cursor_next.v = 0; - stats->cursor_prev.v = 0; - stats->cursor_remove.v = 0; - stats->cursor_reset.v = 0; - stats->cursor_search.v = 0; - stats->cursor_search_near.v = 0; - stats->cursor_update.v = 0; - stats->bloom_false_positive.v = 0; - stats->bloom_hit.v = 0; - stats->bloom_miss.v = 0; - stats->bloom_page_evict.v = 0; - stats->bloom_page_read.v = 0; - stats->bloom_count.v = 0; - stats->lsm_chunk_count.v = 0; - stats->lsm_generation_max.v = 0; - stats->lsm_lookup_no_bloom.v = 0; - stats->lsm_checkpoint_throttle.v = 0; - stats->lsm_merge_throttle.v = 0; - stats->bloom_size.v = 0; - stats->rec_dictionary.v = 0; - stats->rec_suffix_compression.v = 0; - stats->rec_multiblock_internal.v = 0; - stats->rec_overflow_key_internal.v = 0; - stats->rec_prefix_compression.v = 0; - stats->rec_multiblock_leaf.v = 0; - stats->rec_overflow_key_leaf.v = 0; - stats->rec_multiblock_max.v = 0; - stats->rec_overflow_value.v = 0; - stats->rec_page_match.v = 0; - stats->rec_pages.v = 0; - stats->rec_pages_eviction.v = 0; - stats->rec_page_delete.v = 0; - stats->session_compact.v = 0; - stats->txn_update_conflict.v = 0; + for (i = 0; i < WT_COUNTER_SLOTS; ++i) + __wt_stat_dsrc_clear_single(stats[i]); +} + +void +__wt_stat_dsrc_aggregate_single( + WT_DSRC_STATS *from, WT_DSRC_STATS *to) +{ + to->block_extension += from->block_extension; + to->block_alloc += from->block_alloc; + to->block_free += from->block_free; + to->block_checkpoint_size += from->block_checkpoint_size; + to->allocation_size = from->allocation_size; + to->block_reuse_bytes += from->block_reuse_bytes; + to->block_magic = from->block_magic; + to->block_major = from->block_major; + to->block_size += from->block_size; + to->block_minor = from->block_minor; + to->btree_checkpoint_generation += from->btree_checkpoint_generation; + to->btree_column_fix += from->btree_column_fix; + to->btree_column_internal += from->btree_column_internal; + to->btree_column_deleted += from->btree_column_deleted; + 
to->btree_column_variable += from->btree_column_variable; + to->btree_column_rle += from->btree_column_rle; + to->btree_fixed_len = from->btree_fixed_len; + if (from->btree_maxintlkey > to->btree_maxintlkey) + to->btree_maxintlkey = from->btree_maxintlkey; + if (from->btree_maxintlpage > to->btree_maxintlpage) + to->btree_maxintlpage = from->btree_maxintlpage; + if (from->btree_maxleafkey > to->btree_maxleafkey) + to->btree_maxleafkey = from->btree_maxleafkey; + if (from->btree_maxleafpage > to->btree_maxleafpage) + to->btree_maxleafpage = from->btree_maxleafpage; + if (from->btree_maxleafvalue > to->btree_maxleafvalue) + to->btree_maxleafvalue = from->btree_maxleafvalue; + if (from->btree_maximum_depth > to->btree_maximum_depth) + to->btree_maximum_depth = from->btree_maximum_depth; + to->btree_entries += from->btree_entries; + to->btree_overflow += from->btree_overflow; + to->btree_compact_rewrite += from->btree_compact_rewrite; + to->btree_row_internal += from->btree_row_internal; + to->btree_row_leaf += from->btree_row_leaf; + to->cache_bytes_read += from->cache_bytes_read; + to->cache_bytes_write += from->cache_bytes_write; + to->cache_eviction_checkpoint += from->cache_eviction_checkpoint; + to->cache_eviction_fail += from->cache_eviction_fail; + to->cache_eviction_hazard += from->cache_eviction_hazard; + to->cache_inmem_splittable += from->cache_inmem_splittable; + to->cache_inmem_split += from->cache_inmem_split; + to->cache_eviction_internal += from->cache_eviction_internal; + to->cache_eviction_dirty += from->cache_eviction_dirty; + to->cache_read_overflow += from->cache_read_overflow; + to->cache_overflow_value += from->cache_overflow_value; + to->cache_eviction_deepen += from->cache_eviction_deepen; + to->cache_write_lookaside += from->cache_write_lookaside; + to->cache_read += from->cache_read; + to->cache_read_lookaside += from->cache_read_lookaside; + to->cache_eviction_split += from->cache_eviction_split; + to->cache_write += from->cache_write; + to->cache_write_restore += from->cache_write_restore; + to->cache_eviction_clean += from->cache_eviction_clean; + to->compress_read += from->compress_read; + to->compress_write += from->compress_write; + to->compress_write_fail += from->compress_write_fail; + to->compress_write_too_small += from->compress_write_too_small; + to->compress_raw_fail_temporary += from->compress_raw_fail_temporary; + to->compress_raw_fail += from->compress_raw_fail; + to->compress_raw_ok += from->compress_raw_ok; + to->cursor_insert_bulk += from->cursor_insert_bulk; + to->cursor_create += from->cursor_create; + to->cursor_insert_bytes += from->cursor_insert_bytes; + to->cursor_remove_bytes += from->cursor_remove_bytes; + to->cursor_update_bytes += from->cursor_update_bytes; + to->cursor_insert += from->cursor_insert; + to->cursor_next += from->cursor_next; + to->cursor_prev += from->cursor_prev; + to->cursor_remove += from->cursor_remove; + to->cursor_reset += from->cursor_reset; + to->cursor_restart += from->cursor_restart; + to->cursor_search += from->cursor_search; + to->cursor_search_near += from->cursor_search_near; + to->cursor_update += from->cursor_update; + to->bloom_false_positive += from->bloom_false_positive; + to->bloom_hit += from->bloom_hit; + to->bloom_miss += from->bloom_miss; + to->bloom_page_evict += from->bloom_page_evict; + to->bloom_page_read += from->bloom_page_read; + to->bloom_count += from->bloom_count; + to->lsm_chunk_count += from->lsm_chunk_count; + if (from->lsm_generation_max > to->lsm_generation_max) + 
to->lsm_generation_max = from->lsm_generation_max; + to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom; + to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle; + to->lsm_merge_throttle += from->lsm_merge_throttle; + to->bloom_size += from->bloom_size; + to->rec_dictionary += from->rec_dictionary; + to->rec_suffix_compression += from->rec_suffix_compression; + to->rec_multiblock_internal += from->rec_multiblock_internal; + to->rec_overflow_key_internal += from->rec_overflow_key_internal; + to->rec_prefix_compression += from->rec_prefix_compression; + to->rec_multiblock_leaf += from->rec_multiblock_leaf; + to->rec_overflow_key_leaf += from->rec_overflow_key_leaf; + if (from->rec_multiblock_max > to->rec_multiblock_max) + to->rec_multiblock_max = from->rec_multiblock_max; + to->rec_overflow_value += from->rec_overflow_value; + to->rec_page_match += from->rec_page_match; + to->rec_pages += from->rec_pages; + to->rec_pages_eviction += from->rec_pages_eviction; + to->rec_page_delete += from->rec_page_delete; + to->session_compact += from->session_compact; + to->session_cursor_open += from->session_cursor_open; + to->txn_update_conflict += from->txn_update_conflict; } void -__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent) +__wt_stat_dsrc_aggregate( + WT_DSRC_STATS **from, WT_DSRC_STATS *to) { - WT_DSRC_STATS *c, *p; + int64_t v; - c = (WT_DSRC_STATS *)child; - p = (WT_DSRC_STATS *)parent; - p->block_extension.v += c->block_extension.v; - p->block_alloc.v += c->block_alloc.v; - p->block_free.v += c->block_free.v; - p->block_checkpoint_size.v += c->block_checkpoint_size.v; - p->block_reuse_bytes.v += c->block_reuse_bytes.v; - p->block_size.v += c->block_size.v; - p->btree_checkpoint_generation.v += c->btree_checkpoint_generation.v; - p->btree_column_fix.v += c->btree_column_fix.v; - p->btree_column_internal.v += c->btree_column_internal.v; - p->btree_column_deleted.v += c->btree_column_deleted.v; - p->btree_column_variable.v += c->btree_column_variable.v; - if (c->btree_maxintlkey.v > p->btree_maxintlkey.v) - p->btree_maxintlkey.v = c->btree_maxintlkey.v; - if (c->btree_maxintlpage.v > p->btree_maxintlpage.v) - p->btree_maxintlpage.v = c->btree_maxintlpage.v; - if (c->btree_maxleafkey.v > p->btree_maxleafkey.v) - p->btree_maxleafkey.v = c->btree_maxleafkey.v; - if (c->btree_maxleafpage.v > p->btree_maxleafpage.v) - p->btree_maxleafpage.v = c->btree_maxleafpage.v; - if (c->btree_maxleafvalue.v > p->btree_maxleafvalue.v) - p->btree_maxleafvalue.v = c->btree_maxleafvalue.v; - if (c->btree_maximum_depth.v > p->btree_maximum_depth.v) - p->btree_maximum_depth.v = c->btree_maximum_depth.v; - p->btree_entries.v += c->btree_entries.v; - p->btree_overflow.v += c->btree_overflow.v; - p->btree_compact_rewrite.v += c->btree_compact_rewrite.v; - p->btree_row_internal.v += c->btree_row_internal.v; - p->btree_row_leaf.v += c->btree_row_leaf.v; - p->cache_bytes_read.v += c->cache_bytes_read.v; - p->cache_bytes_write.v += c->cache_bytes_write.v; - p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v; - p->cache_eviction_fail.v += c->cache_eviction_fail.v; - p->cache_eviction_hazard.v += c->cache_eviction_hazard.v; - p->cache_inmem_split.v += c->cache_inmem_split.v; - p->cache_eviction_internal.v += c->cache_eviction_internal.v; - p->cache_eviction_dirty.v += c->cache_eviction_dirty.v; - p->cache_read_overflow.v += c->cache_read_overflow.v; - p->cache_overflow_value.v += c->cache_overflow_value.v; - p->cache_eviction_deepen.v += c->cache_eviction_deepen.v; - p->cache_read.v 
+= c->cache_read.v; - p->cache_eviction_split.v += c->cache_eviction_split.v; - p->cache_write.v += c->cache_write.v; - p->cache_eviction_clean.v += c->cache_eviction_clean.v; - p->compress_read.v += c->compress_read.v; - p->compress_write.v += c->compress_write.v; - p->compress_write_fail.v += c->compress_write_fail.v; - p->compress_write_too_small.v += c->compress_write_too_small.v; - p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v; - p->compress_raw_fail.v += c->compress_raw_fail.v; - p->compress_raw_ok.v += c->compress_raw_ok.v; - p->cursor_insert_bulk.v += c->cursor_insert_bulk.v; - p->cursor_create.v += c->cursor_create.v; - p->cursor_insert_bytes.v += c->cursor_insert_bytes.v; - p->cursor_remove_bytes.v += c->cursor_remove_bytes.v; - p->cursor_update_bytes.v += c->cursor_update_bytes.v; - p->cursor_insert.v += c->cursor_insert.v; - p->cursor_next.v += c->cursor_next.v; - p->cursor_prev.v += c->cursor_prev.v; - p->cursor_remove.v += c->cursor_remove.v; - p->cursor_reset.v += c->cursor_reset.v; - p->cursor_search.v += c->cursor_search.v; - p->cursor_search_near.v += c->cursor_search_near.v; - p->cursor_update.v += c->cursor_update.v; - p->bloom_false_positive.v += c->bloom_false_positive.v; - p->bloom_hit.v += c->bloom_hit.v; - p->bloom_miss.v += c->bloom_miss.v; - p->bloom_page_evict.v += c->bloom_page_evict.v; - p->bloom_page_read.v += c->bloom_page_read.v; - p->bloom_count.v += c->bloom_count.v; - p->lsm_chunk_count.v += c->lsm_chunk_count.v; - if (c->lsm_generation_max.v > p->lsm_generation_max.v) - p->lsm_generation_max.v = c->lsm_generation_max.v; - p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v; - p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v; - p->lsm_merge_throttle.v += c->lsm_merge_throttle.v; - p->bloom_size.v += c->bloom_size.v; - p->rec_dictionary.v += c->rec_dictionary.v; - p->rec_suffix_compression.v += c->rec_suffix_compression.v; - p->rec_multiblock_internal.v += c->rec_multiblock_internal.v; - p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v; - p->rec_prefix_compression.v += c->rec_prefix_compression.v; - p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v; - p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v; - if (c->rec_multiblock_max.v > p->rec_multiblock_max.v) - p->rec_multiblock_max.v = c->rec_multiblock_max.v; - p->rec_overflow_value.v += c->rec_overflow_value.v; - p->rec_page_match.v += c->rec_page_match.v; - p->rec_pages.v += c->rec_pages.v; - p->rec_pages_eviction.v += c->rec_pages_eviction.v; - p->rec_page_delete.v += c->rec_page_delete.v; - p->session_compact.v += c->session_compact.v; - p->session_cursor_open.v += c->session_cursor_open.v; - p->txn_update_conflict.v += c->txn_update_conflict.v; + to->block_extension += WT_STAT_READ(from, block_extension); + to->block_alloc += WT_STAT_READ(from, block_alloc); + to->block_free += WT_STAT_READ(from, block_free); + to->block_checkpoint_size += + WT_STAT_READ(from, block_checkpoint_size); + to->allocation_size = from[0]->allocation_size; + to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes); + to->block_magic = from[0]->block_magic; + to->block_major = from[0]->block_major; + to->block_size += WT_STAT_READ(from, block_size); + to->block_minor = from[0]->block_minor; + to->btree_checkpoint_generation += + WT_STAT_READ(from, btree_checkpoint_generation); + to->btree_column_fix += WT_STAT_READ(from, btree_column_fix); + to->btree_column_internal += + WT_STAT_READ(from, btree_column_internal); + to->btree_column_deleted += 
WT_STAT_READ(from, btree_column_deleted); + to->btree_column_variable += + WT_STAT_READ(from, btree_column_variable); + to->btree_column_rle += WT_STAT_READ(from, btree_column_rle); + to->btree_fixed_len = from[0]->btree_fixed_len; + if ((v = WT_STAT_READ(from, btree_maxintlkey)) > + to->btree_maxintlkey) + to->btree_maxintlkey = v; + if ((v = WT_STAT_READ(from, btree_maxintlpage)) > + to->btree_maxintlpage) + to->btree_maxintlpage = v; + if ((v = WT_STAT_READ(from, btree_maxleafkey)) > + to->btree_maxleafkey) + to->btree_maxleafkey = v; + if ((v = WT_STAT_READ(from, btree_maxleafpage)) > + to->btree_maxleafpage) + to->btree_maxleafpage = v; + if ((v = WT_STAT_READ(from, btree_maxleafvalue)) > + to->btree_maxleafvalue) + to->btree_maxleafvalue = v; + if ((v = WT_STAT_READ(from, btree_maximum_depth)) > + to->btree_maximum_depth) + to->btree_maximum_depth = v; + to->btree_entries += WT_STAT_READ(from, btree_entries); + to->btree_overflow += WT_STAT_READ(from, btree_overflow); + to->btree_compact_rewrite += + WT_STAT_READ(from, btree_compact_rewrite); + to->btree_row_internal += WT_STAT_READ(from, btree_row_internal); + to->btree_row_leaf += WT_STAT_READ(from, btree_row_leaf); + to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read); + to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write); + to->cache_eviction_checkpoint += + WT_STAT_READ(from, cache_eviction_checkpoint); + to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); + to->cache_eviction_hazard += + WT_STAT_READ(from, cache_eviction_hazard); + to->cache_inmem_splittable += + WT_STAT_READ(from, cache_inmem_splittable); + to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); + to->cache_eviction_internal += + WT_STAT_READ(from, cache_eviction_internal); + to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); + to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow); + to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value); + to->cache_eviction_deepen += + WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); + to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); + to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); + to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); + to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); + to->compress_read += WT_STAT_READ(from, compress_read); + to->compress_write += WT_STAT_READ(from, compress_write); + to->compress_write_fail += WT_STAT_READ(from, compress_write_fail); + to->compress_write_too_small += + WT_STAT_READ(from, compress_write_too_small); + to->compress_raw_fail_temporary += + WT_STAT_READ(from, compress_raw_fail_temporary); + to->compress_raw_fail += WT_STAT_READ(from, compress_raw_fail); + to->compress_raw_ok += WT_STAT_READ(from, compress_raw_ok); + to->cursor_insert_bulk += WT_STAT_READ(from, cursor_insert_bulk); + to->cursor_create += WT_STAT_READ(from, cursor_create); + to->cursor_insert_bytes += WT_STAT_READ(from, cursor_insert_bytes); + to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes); + to->cursor_update_bytes += WT_STAT_READ(from, cursor_update_bytes); + to->cursor_insert += WT_STAT_READ(from, cursor_insert); + to->cursor_next += WT_STAT_READ(from, cursor_next); + to->cursor_prev += WT_STAT_READ(from, cursor_prev); + to->cursor_remove += 
WT_STAT_READ(from, cursor_remove); + to->cursor_reset += WT_STAT_READ(from, cursor_reset); + to->cursor_restart += WT_STAT_READ(from, cursor_restart); + to->cursor_search += WT_STAT_READ(from, cursor_search); + to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); + to->cursor_update += WT_STAT_READ(from, cursor_update); + to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive); + to->bloom_hit += WT_STAT_READ(from, bloom_hit); + to->bloom_miss += WT_STAT_READ(from, bloom_miss); + to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict); + to->bloom_page_read += WT_STAT_READ(from, bloom_page_read); + to->bloom_count += WT_STAT_READ(from, bloom_count); + to->lsm_chunk_count += WT_STAT_READ(from, lsm_chunk_count); + if ((v = WT_STAT_READ(from, lsm_generation_max)) > + to->lsm_generation_max) + to->lsm_generation_max = v; + to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->bloom_size += WT_STAT_READ(from, bloom_size); + to->rec_dictionary += WT_STAT_READ(from, rec_dictionary); + to->rec_suffix_compression += + WT_STAT_READ(from, rec_suffix_compression); + to->rec_multiblock_internal += + WT_STAT_READ(from, rec_multiblock_internal); + to->rec_overflow_key_internal += + WT_STAT_READ(from, rec_overflow_key_internal); + to->rec_prefix_compression += + WT_STAT_READ(from, rec_prefix_compression); + to->rec_multiblock_leaf += WT_STAT_READ(from, rec_multiblock_leaf); + to->rec_overflow_key_leaf += + WT_STAT_READ(from, rec_overflow_key_leaf); + if ((v = WT_STAT_READ(from, rec_multiblock_max)) > + to->rec_multiblock_max) + to->rec_multiblock_max = v; + to->rec_overflow_value += WT_STAT_READ(from, rec_overflow_value); + to->rec_page_match += WT_STAT_READ(from, rec_page_match); + to->rec_pages += WT_STAT_READ(from, rec_pages); + to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); + to->rec_page_delete += WT_STAT_READ(from, rec_page_delete); + to->session_compact += WT_STAT_READ(from, session_compact); + to->session_cursor_open += WT_STAT_READ(from, session_cursor_open); + to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict); +} + +static const char * const __stats_connection_desc[] = { + "async: number of allocation state races", + "async: number of operation slots viewed for allocation", + "async: current work queue length", + "async: number of flush calls", + "async: number of times operation allocation failed", + "async: maximum work queue length", + "async: number of times worker found no work", + "async: total allocations", + "async: total compact calls", + "async: total insert calls", + "async: total remove calls", + "async: total search calls", + "async: total update calls", + "block-manager: mapped bytes read", + "block-manager: bytes read", + "block-manager: bytes written", + "block-manager: mapped blocks read", + "block-manager: blocks pre-loaded", + "block-manager: blocks read", + "block-manager: blocks written", + "cache: tracked dirty bytes in the cache", + "cache: tracked bytes belonging to internal pages in the cache", + "cache: bytes currently in the cache", + "cache: tracked bytes belonging to leaf pages in the cache", + "cache: maximum bytes configured", + "cache: tracked bytes belonging to overflow pages in the cache", + "cache: bytes read into cache", + "cache: bytes written from cache", + "cache: pages evicted by application threads", + "cache: checkpoint 
blocked page eviction", + "cache: unmodified pages evicted", + "cache: page split during eviction deepened the tree", + "cache: modified pages evicted", + "cache: pages selected for eviction unable to be evicted", + "cache: pages evicted because they exceeded the in-memory maximum", + "cache: pages evicted because they had chains of deleted items", + "cache: failed eviction of pages that exceeded the in-memory maximum", + "cache: hazard pointer blocked page eviction", + "cache: internal pages evicted", + "cache: maximum page size at eviction", + "cache: eviction server candidate queue empty when topping up", + "cache: eviction server candidate queue not empty when topping up", + "cache: eviction server evicting pages", + "cache: eviction server populating queue, but not evicting pages", + "cache: eviction server unable to reach eviction goal", + "cache: pages split during eviction", + "cache: pages walked for eviction", + "cache: eviction worker thread evicting pages", + "cache: in-memory page splits", + "cache: in-memory page passed criteria to be split", + "cache: lookaside table insert calls", + "cache: lookaside table remove calls", + "cache: percentage overhead", + "cache: tracked dirty pages in the cache", + "cache: pages currently held in the cache", + "cache: pages read into cache", + "cache: pages read into cache requiring lookaside entries", + "cache: pages written from cache", + "cache: page written requiring lookaside records", + "cache: pages written requiring in-memory restoration", + "connection: pthread mutex condition wait calls", + "cursor: cursor create calls", + "cursor: cursor insert calls", + "cursor: cursor next calls", + "cursor: cursor prev calls", + "cursor: cursor remove calls", + "cursor: cursor reset calls", + "cursor: cursor restarted searches", + "cursor: cursor search calls", + "cursor: cursor search near calls", + "cursor: cursor update calls", + "data-handle: connection data handles currently active", + "data-handle: session dhandles swept", + "data-handle: session sweep attempts", + "data-handle: connection sweep dhandles closed", + "data-handle: connection sweep candidate became referenced", + "data-handle: connection sweep dhandles removed from hash list", + "data-handle: connection sweep time-of-death sets", + "data-handle: connection sweeps", + "connection: files currently open", + "log: total log buffer size", + "log: log bytes of payload data", + "log: log bytes written", + "log: yields waiting for previous log file close", + "log: total size of compressed records", + "log: total in-memory size of compressed records", + "log: log records too small to compress", + "log: log records not compressed", + "log: log records compressed", + "log: maximum log file size", + "log: pre-allocated log files prepared", + "log: number of pre-allocated log files to create", + "log: pre-allocated log files used", + "log: log release advances write LSN", + "log: records processed by log scan", + "log: log scan records requiring two reads", + "log: log scan operations", + "log: consolidated slot closures", + "log: written slots coalesced", + "log: logging bytes consolidated", + "log: consolidated slot joins", + "log: consolidated slot join races", + "log: busy returns attempting to switch slots", + "log: consolidated slot join transitions", + "log: consolidated slot unbuffered writes", + "log: log sync operations", + "log: log sync_dir operations", + "log: log server thread advances write LSN", + "log: log write operations", + "LSM: sleep for LSM checkpoint throttle", + 
"LSM: sleep for LSM merge throttle", + "LSM: rows merged in an LSM tree", + "LSM: application work units currently queued", + "LSM: merge work units currently queued", + "LSM: tree queue hit maximum", + "LSM: switch work units currently queued", + "LSM: tree maintenance operations scheduled", + "LSM: tree maintenance operations discarded", + "LSM: tree maintenance operations executed", + "connection: memory allocations", + "connection: memory frees", + "connection: memory re-allocations", + "thread-yield: page acquire busy blocked", + "thread-yield: page acquire eviction blocked", + "thread-yield: page acquire locked blocked", + "thread-yield: page acquire read blocked", + "thread-yield: page acquire time sleeping (usecs)", + "connection: total read I/Os", + "reconciliation: page reconciliation calls", + "reconciliation: page reconciliation calls for eviction", + "reconciliation: split bytes currently awaiting free", + "reconciliation: split objects currently awaiting free", + "connection: pthread mutex shared lock read-lock calls", + "connection: pthread mutex shared lock write-lock calls", + "session: open cursor count", + "session: open session count", + "transaction: transaction begins", + "transaction: transaction checkpoints", + "transaction: transaction checkpoint generation", + "transaction: transaction checkpoint currently running", + "transaction: transaction checkpoint max time (msecs)", + "transaction: transaction checkpoint min time (msecs)", + "transaction: transaction checkpoint most recent time (msecs)", + "transaction: transaction checkpoint total time (msecs)", + "transaction: transactions committed", + "transaction: transaction failures due to cache overflow", + "transaction: transaction range of IDs currently pinned by a checkpoint", + "transaction: transaction range of IDs currently pinned", + "transaction: transactions rolled back", + "transaction: transaction sync calls", + "connection: total write I/Os", +}; + +const char * +__wt_stat_connection_desc(int slot) +{ + return (__stats_connection_desc[slot]); } void -__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats) +__wt_stat_connection_init_single(WT_CONNECTION_STATS *stats) { - /* Clear, so can also be called for reinitialization. 
*/ memset(stats, 0, sizeof(*stats)); +} + +void +__wt_stat_connection_init(WT_CONNECTION_IMPL *handle) +{ + int i; - stats->async_cur_queue.desc = "async: current work queue length"; - stats->async_max_queue.desc = "async: maximum work queue length"; - stats->async_alloc_race.desc = - "async: number of allocation state races"; - stats->async_flush.desc = "async: number of flush calls"; - stats->async_alloc_view.desc = - "async: number of operation slots viewed for allocation"; - stats->async_full.desc = - "async: number of times operation allocation failed"; - stats->async_nowork.desc = - "async: number of times worker found no work"; - stats->async_op_alloc.desc = "async: total allocations"; - stats->async_op_compact.desc = "async: total compact calls"; - stats->async_op_insert.desc = "async: total insert calls"; - stats->async_op_remove.desc = "async: total remove calls"; - stats->async_op_search.desc = "async: total search calls"; - stats->async_op_update.desc = "async: total update calls"; - stats->block_preload.desc = "block-manager: blocks pre-loaded"; - stats->block_read.desc = "block-manager: blocks read"; - stats->block_write.desc = "block-manager: blocks written"; - stats->block_byte_read.desc = "block-manager: bytes read"; - stats->block_byte_write.desc = "block-manager: bytes written"; - stats->block_map_read.desc = "block-manager: mapped blocks read"; - stats->block_byte_map_read.desc = "block-manager: mapped bytes read"; - stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache"; - stats->cache_bytes_read.desc = "cache: bytes read into cache"; - stats->cache_bytes_write.desc = "cache: bytes written from cache"; - stats->cache_eviction_checkpoint.desc = - "cache: checkpoint blocked page eviction"; - stats->cache_eviction_queue_empty.desc = - "cache: eviction server candidate queue empty when topping up"; - stats->cache_eviction_queue_not_empty.desc = - "cache: eviction server candidate queue not empty when topping up"; - stats->cache_eviction_server_evicting.desc = - "cache: eviction server evicting pages"; - stats->cache_eviction_server_not_evicting.desc = - "cache: eviction server populating queue, but not evicting pages"; - stats->cache_eviction_slow.desc = - "cache: eviction server unable to reach eviction goal"; - stats->cache_eviction_worker_evicting.desc = - "cache: eviction worker thread evicting pages"; - stats->cache_eviction_force_fail.desc = - "cache: failed eviction of pages that exceeded the in-memory maximum"; - stats->cache_eviction_hazard.desc = - "cache: hazard pointer blocked page eviction"; - stats->cache_inmem_split.desc = "cache: in-memory page splits"; - stats->cache_eviction_internal.desc = "cache: internal pages evicted"; - stats->cache_bytes_max.desc = "cache: maximum bytes configured"; - stats->cache_eviction_maximum_page_size.desc = - "cache: maximum page size at eviction"; - stats->cache_eviction_dirty.desc = "cache: modified pages evicted"; - stats->cache_eviction_deepen.desc = - "cache: page split during eviction deepened the tree"; - stats->cache_pages_inuse.desc = - "cache: pages currently held in the cache"; - stats->cache_eviction_force.desc = - "cache: pages evicted because they exceeded the in-memory maximum"; - stats->cache_eviction_force_delete.desc = - "cache: pages evicted because they had chains of deleted items"; - stats->cache_eviction_app.desc = - "cache: pages evicted by application threads"; - stats->cache_read.desc = "cache: pages read into cache"; - stats->cache_eviction_fail.desc = - "cache: pages selected for 
eviction unable to be evicted"; - stats->cache_eviction_split.desc = - "cache: pages split during eviction"; - stats->cache_eviction_walk.desc = "cache: pages walked for eviction"; - stats->cache_write.desc = "cache: pages written from cache"; - stats->cache_overhead.desc = "cache: percentage overhead"; - stats->cache_bytes_internal.desc = - "cache: tracked bytes belonging to internal pages in the cache"; - stats->cache_bytes_leaf.desc = - "cache: tracked bytes belonging to leaf pages in the cache"; - stats->cache_bytes_overflow.desc = - "cache: tracked bytes belonging to overflow pages in the cache"; - stats->cache_bytes_dirty.desc = - "cache: tracked dirty bytes in the cache"; - stats->cache_pages_dirty.desc = - "cache: tracked dirty pages in the cache"; - stats->cache_eviction_clean.desc = "cache: unmodified pages evicted"; - stats->file_open.desc = "connection: files currently open"; - stats->memory_allocation.desc = "connection: memory allocations"; - stats->memory_free.desc = "connection: memory frees"; - stats->memory_grow.desc = "connection: memory re-allocations"; - stats->cond_wait.desc = - "connection: pthread mutex condition wait calls"; - stats->rwlock_read.desc = - "connection: pthread mutex shared lock read-lock calls"; - stats->rwlock_write.desc = - "connection: pthread mutex shared lock write-lock calls"; - stats->read_io.desc = "connection: total read I/Os"; - stats->write_io.desc = "connection: total write I/Os"; - stats->cursor_create.desc = "cursor: cursor create calls"; - stats->cursor_insert.desc = "cursor: cursor insert calls"; - stats->cursor_next.desc = "cursor: cursor next calls"; - stats->cursor_prev.desc = "cursor: cursor prev calls"; - stats->cursor_remove.desc = "cursor: cursor remove calls"; - stats->cursor_reset.desc = "cursor: cursor reset calls"; - stats->cursor_search.desc = "cursor: cursor search calls"; - stats->cursor_search_near.desc = "cursor: cursor search near calls"; - stats->cursor_update.desc = "cursor: cursor update calls"; - stats->dh_conn_ref.desc = - "data-handle: connection candidate referenced"; - stats->dh_conn_handles.desc = "data-handle: connection dhandles swept"; - stats->dh_conn_sweeps.desc = "data-handle: connection sweeps"; - stats->dh_conn_tod.desc = "data-handle: connection time-of-death sets"; - stats->dh_session_handles.desc = "data-handle: session dhandles swept"; - stats->dh_session_sweeps.desc = "data-handle: session sweep attempts"; - stats->log_slot_closes.desc = "log: consolidated slot closures"; - stats->log_slot_races.desc = "log: consolidated slot join races"; - stats->log_slot_transitions.desc = - "log: consolidated slot join transitions"; - stats->log_slot_joins.desc = "log: consolidated slot joins"; - stats->log_slot_toosmall.desc = - "log: failed to find a slot large enough for record"; - stats->log_bytes_payload.desc = "log: log bytes of payload data"; - stats->log_bytes_written.desc = "log: log bytes written"; - stats->log_compress_writes.desc = "log: log records compressed"; - stats->log_compress_write_fails.desc = - "log: log records not compressed"; - stats->log_compress_small.desc = - "log: log records too small to compress"; - stats->log_release_write_lsn.desc = - "log: log release advances write LSN"; - stats->log_scans.desc = "log: log scan operations"; - stats->log_scan_rereads.desc = - "log: log scan records requiring two reads"; - stats->log_write_lsn.desc = - "log: log server thread advances write LSN"; - stats->log_sync.desc = "log: log sync operations"; - stats->log_sync_dir.desc = "log: log 
sync_dir operations"; - stats->log_writes.desc = "log: log write operations"; - stats->log_slot_consolidated.desc = "log: logging bytes consolidated"; - stats->log_max_filesize.desc = "log: maximum log file size"; - stats->log_prealloc_max.desc = - "log: number of pre-allocated log files to create"; - stats->log_prealloc_files.desc = - "log: pre-allocated log files prepared"; - stats->log_prealloc_used.desc = "log: pre-allocated log files used"; - stats->log_slot_toobig.desc = "log: record size exceeded maximum"; - stats->log_scan_records.desc = "log: records processed by log scan"; - stats->log_compress_mem.desc = - "log: total in-memory size of compressed records"; - stats->log_buffer_size.desc = "log: total log buffer size"; - stats->log_compress_len.desc = "log: total size of compressed records"; - stats->log_slot_coalesced.desc = "log: written slots coalesced"; - stats->log_close_yields.desc = - "log: yields waiting for previous log file close"; - stats->lsm_work_queue_app.desc = - "LSM: application work units currently queued"; - stats->lsm_work_queue_manager.desc = - "LSM: merge work units currently queued"; - stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree"; - stats->lsm_checkpoint_throttle.desc = - "LSM: sleep for LSM checkpoint throttle"; - stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle"; - stats->lsm_work_queue_switch.desc = - "LSM: switch work units currently queued"; - stats->lsm_work_units_discarded.desc = - "LSM: tree maintenance operations discarded"; - stats->lsm_work_units_done.desc = - "LSM: tree maintenance operations executed"; - stats->lsm_work_units_created.desc = - "LSM: tree maintenance operations scheduled"; - stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum"; - stats->rec_pages.desc = "reconciliation: page reconciliation calls"; - stats->rec_pages_eviction.desc = - "reconciliation: page reconciliation calls for eviction"; - stats->rec_split_stashed_bytes.desc = - "reconciliation: split bytes currently awaiting free"; - stats->rec_split_stashed_objects.desc = - "reconciliation: split objects currently awaiting free"; - stats->session_cursor_open.desc = "session: open cursor count"; - stats->session_open.desc = "session: open session count"; - stats->page_busy_blocked.desc = - "thread-yield: page acquire busy blocked"; - stats->page_forcible_evict_blocked.desc = - "thread-yield: page acquire eviction blocked"; - stats->page_locked_blocked.desc = - "thread-yield: page acquire locked blocked"; - stats->page_read_blocked.desc = - "thread-yield: page acquire read blocked"; - stats->page_sleep.desc = - "thread-yield: page acquire time sleeping (usecs)"; - stats->txn_begin.desc = "transaction: transaction begins"; - stats->txn_checkpoint_running.desc = - "transaction: transaction checkpoint currently running"; - stats->txn_checkpoint_generation.desc = - "transaction: transaction checkpoint generation"; - stats->txn_checkpoint_time_max.desc = - "transaction: transaction checkpoint max time (msecs)"; - stats->txn_checkpoint_time_min.desc = - "transaction: transaction checkpoint min time (msecs)"; - stats->txn_checkpoint_time_recent.desc = - "transaction: transaction checkpoint most recent time (msecs)"; - stats->txn_checkpoint_time_total.desc = - "transaction: transaction checkpoint total time (msecs)"; - stats->txn_checkpoint.desc = "transaction: transaction checkpoints"; - stats->txn_fail_cache.desc = - "transaction: transaction failures due to cache overflow"; - stats->txn_pinned_range.desc = - "transaction: transaction 
range of IDs currently pinned"; - stats->txn_pinned_checkpoint_range.desc = - "transaction: transaction range of IDs currently pinned by a checkpoint"; - stats->txn_sync.desc = "transaction: transaction sync calls"; - stats->txn_commit.desc = "transaction: transactions committed"; - stats->txn_rollback.desc = "transaction: transactions rolled back"; + for (i = 0; i < WT_COUNTER_SLOTS; ++i) { + handle->stats[i] = &handle->stat_array[i]; + __wt_stat_connection_init_single(handle->stats[i]); + } } void -__wt_stat_refresh_connection_stats(void *stats_arg) +__wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) { - WT_CONNECTION_STATS *stats; + stats->async_cur_queue = 0; + /* not clearing async_max_queue */ + stats->async_alloc_race = 0; + stats->async_flush = 0; + stats->async_alloc_view = 0; + stats->async_full = 0; + stats->async_nowork = 0; + stats->async_op_alloc = 0; + stats->async_op_compact = 0; + stats->async_op_insert = 0; + stats->async_op_remove = 0; + stats->async_op_search = 0; + stats->async_op_update = 0; + stats->block_preload = 0; + stats->block_read = 0; + stats->block_write = 0; + stats->block_byte_read = 0; + stats->block_byte_write = 0; + stats->block_map_read = 0; + stats->block_byte_map_read = 0; + /* not clearing cache_bytes_inuse */ + stats->cache_bytes_read = 0; + stats->cache_bytes_write = 0; + stats->cache_eviction_checkpoint = 0; + stats->cache_eviction_queue_empty = 0; + stats->cache_eviction_queue_not_empty = 0; + stats->cache_eviction_server_evicting = 0; + stats->cache_eviction_server_not_evicting = 0; + stats->cache_eviction_slow = 0; + stats->cache_eviction_worker_evicting = 0; + stats->cache_eviction_force_fail = 0; + stats->cache_eviction_hazard = 0; + stats->cache_inmem_splittable = 0; + stats->cache_inmem_split = 0; + stats->cache_eviction_internal = 0; + stats->cache_lookaside_insert = 0; + stats->cache_lookaside_remove = 0; + /* not clearing cache_bytes_max */ + /* not clearing cache_eviction_maximum_page_size */ + stats->cache_eviction_dirty = 0; + stats->cache_eviction_deepen = 0; + stats->cache_write_lookaside = 0; + /* not clearing cache_pages_inuse */ + stats->cache_eviction_force = 0; + stats->cache_eviction_force_delete = 0; + stats->cache_eviction_app = 0; + stats->cache_read = 0; + stats->cache_read_lookaside = 0; + stats->cache_eviction_fail = 0; + stats->cache_eviction_split = 0; + stats->cache_eviction_walk = 0; + stats->cache_write = 0; + stats->cache_write_restore = 0; + /* not clearing cache_overhead */ + /* not clearing cache_bytes_internal */ + /* not clearing cache_bytes_leaf */ + /* not clearing cache_bytes_overflow */ + /* not clearing cache_bytes_dirty */ + /* not clearing cache_pages_dirty */ + stats->cache_eviction_clean = 0; + /* not clearing file_open */ + stats->memory_allocation = 0; + stats->memory_free = 0; + stats->memory_grow = 0; + stats->cond_wait = 0; + stats->rwlock_read = 0; + stats->rwlock_write = 0; + stats->read_io = 0; + stats->write_io = 0; + stats->cursor_create = 0; + stats->cursor_insert = 0; + stats->cursor_next = 0; + stats->cursor_prev = 0; + stats->cursor_remove = 0; + stats->cursor_reset = 0; + stats->cursor_restart = 0; + stats->cursor_search = 0; + stats->cursor_search_near = 0; + stats->cursor_update = 0; + /* not clearing dh_conn_handle_count */ + stats->dh_sweep_ref = 0; + stats->dh_sweep_close = 0; + stats->dh_sweep_remove = 0; + stats->dh_sweep_tod = 0; + stats->dh_sweeps = 0; + stats->dh_session_handles = 0; + stats->dh_session_sweeps = 0; + stats->log_slot_switch_busy = 0; + 
stats->log_slot_closes = 0; + stats->log_slot_races = 0; + stats->log_slot_transitions = 0; + stats->log_slot_joins = 0; + stats->log_slot_unbuffered = 0; + stats->log_bytes_payload = 0; + stats->log_bytes_written = 0; + stats->log_compress_writes = 0; + stats->log_compress_write_fails = 0; + stats->log_compress_small = 0; + stats->log_release_write_lsn = 0; + stats->log_scans = 0; + stats->log_scan_rereads = 0; + stats->log_write_lsn = 0; + stats->log_sync = 0; + stats->log_sync_dir = 0; + stats->log_writes = 0; + stats->log_slot_consolidated = 0; + /* not clearing log_max_filesize */ + /* not clearing log_prealloc_max */ + stats->log_prealloc_files = 0; + stats->log_prealloc_used = 0; + stats->log_scan_records = 0; + stats->log_compress_mem = 0; + /* not clearing log_buffer_size */ + stats->log_compress_len = 0; + stats->log_slot_coalesced = 0; + stats->log_close_yields = 0; + /* not clearing lsm_work_queue_app */ + /* not clearing lsm_work_queue_manager */ + stats->lsm_rows_merged = 0; + stats->lsm_checkpoint_throttle = 0; + stats->lsm_merge_throttle = 0; + /* not clearing lsm_work_queue_switch */ + stats->lsm_work_units_discarded = 0; + stats->lsm_work_units_done = 0; + stats->lsm_work_units_created = 0; + stats->lsm_work_queue_max = 0; + stats->rec_pages = 0; + stats->rec_pages_eviction = 0; + /* not clearing rec_split_stashed_bytes */ + /* not clearing rec_split_stashed_objects */ + /* not clearing session_cursor_open */ + /* not clearing session_open */ + stats->page_busy_blocked = 0; + stats->page_forcible_evict_blocked = 0; + stats->page_locked_blocked = 0; + stats->page_read_blocked = 0; + stats->page_sleep = 0; + stats->txn_begin = 0; + /* not clearing txn_checkpoint_running */ + /* not clearing txn_checkpoint_generation */ + /* not clearing txn_checkpoint_time_max */ + /* not clearing txn_checkpoint_time_min */ + /* not clearing txn_checkpoint_time_recent */ + /* not clearing txn_checkpoint_time_total */ + stats->txn_checkpoint = 0; + stats->txn_fail_cache = 0; + /* not clearing txn_pinned_range */ + /* not clearing txn_pinned_checkpoint_range */ + stats->txn_sync = 0; + stats->txn_commit = 0; + stats->txn_rollback = 0; +} + +void +__wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats) +{ + u_int i; - stats = (WT_CONNECTION_STATS *)stats_arg; - stats->async_cur_queue.v = 0; - stats->async_alloc_race.v = 0; - stats->async_flush.v = 0; - stats->async_alloc_view.v = 0; - stats->async_full.v = 0; - stats->async_nowork.v = 0; - stats->async_op_alloc.v = 0; - stats->async_op_compact.v = 0; - stats->async_op_insert.v = 0; - stats->async_op_remove.v = 0; - stats->async_op_search.v = 0; - stats->async_op_update.v = 0; - stats->block_preload.v = 0; - stats->block_read.v = 0; - stats->block_write.v = 0; - stats->block_byte_read.v = 0; - stats->block_byte_write.v = 0; - stats->block_map_read.v = 0; - stats->block_byte_map_read.v = 0; - stats->cache_bytes_read.v = 0; - stats->cache_bytes_write.v = 0; - stats->cache_eviction_checkpoint.v = 0; - stats->cache_eviction_queue_empty.v = 0; - stats->cache_eviction_queue_not_empty.v = 0; - stats->cache_eviction_server_evicting.v = 0; - stats->cache_eviction_server_not_evicting.v = 0; - stats->cache_eviction_slow.v = 0; - stats->cache_eviction_worker_evicting.v = 0; - stats->cache_eviction_force_fail.v = 0; - stats->cache_eviction_hazard.v = 0; - stats->cache_inmem_split.v = 0; - stats->cache_eviction_internal.v = 0; - stats->cache_eviction_dirty.v = 0; - stats->cache_eviction_deepen.v = 0; - stats->cache_eviction_force.v = 0; - 
stats->cache_eviction_force_delete.v = 0; - stats->cache_eviction_app.v = 0; - stats->cache_read.v = 0; - stats->cache_eviction_fail.v = 0; - stats->cache_eviction_split.v = 0; - stats->cache_eviction_walk.v = 0; - stats->cache_write.v = 0; - stats->cache_eviction_clean.v = 0; - stats->memory_allocation.v = 0; - stats->memory_free.v = 0; - stats->memory_grow.v = 0; - stats->cond_wait.v = 0; - stats->rwlock_read.v = 0; - stats->rwlock_write.v = 0; - stats->read_io.v = 0; - stats->write_io.v = 0; - stats->cursor_create.v = 0; - stats->cursor_insert.v = 0; - stats->cursor_next.v = 0; - stats->cursor_prev.v = 0; - stats->cursor_remove.v = 0; - stats->cursor_reset.v = 0; - stats->cursor_search.v = 0; - stats->cursor_search_near.v = 0; - stats->cursor_update.v = 0; - stats->dh_conn_ref.v = 0; - stats->dh_conn_handles.v = 0; - stats->dh_conn_sweeps.v = 0; - stats->dh_conn_tod.v = 0; - stats->dh_session_handles.v = 0; - stats->dh_session_sweeps.v = 0; - stats->log_slot_closes.v = 0; - stats->log_slot_races.v = 0; - stats->log_slot_transitions.v = 0; - stats->log_slot_joins.v = 0; - stats->log_slot_toosmall.v = 0; - stats->log_bytes_payload.v = 0; - stats->log_bytes_written.v = 0; - stats->log_compress_writes.v = 0; - stats->log_compress_write_fails.v = 0; - stats->log_compress_small.v = 0; - stats->log_release_write_lsn.v = 0; - stats->log_scans.v = 0; - stats->log_scan_rereads.v = 0; - stats->log_write_lsn.v = 0; - stats->log_sync.v = 0; - stats->log_sync_dir.v = 0; - stats->log_writes.v = 0; - stats->log_slot_consolidated.v = 0; - stats->log_prealloc_files.v = 0; - stats->log_prealloc_used.v = 0; - stats->log_slot_toobig.v = 0; - stats->log_scan_records.v = 0; - stats->log_compress_mem.v = 0; - stats->log_compress_len.v = 0; - stats->log_slot_coalesced.v = 0; - stats->log_close_yields.v = 0; - stats->lsm_rows_merged.v = 0; - stats->lsm_checkpoint_throttle.v = 0; - stats->lsm_merge_throttle.v = 0; - stats->lsm_work_units_discarded.v = 0; - stats->lsm_work_units_done.v = 0; - stats->lsm_work_units_created.v = 0; - stats->lsm_work_queue_max.v = 0; - stats->rec_pages.v = 0; - stats->rec_pages_eviction.v = 0; - stats->page_busy_blocked.v = 0; - stats->page_forcible_evict_blocked.v = 0; - stats->page_locked_blocked.v = 0; - stats->page_read_blocked.v = 0; - stats->page_sleep.v = 0; - stats->txn_begin.v = 0; - stats->txn_checkpoint.v = 0; - stats->txn_fail_cache.v = 0; - stats->txn_sync.v = 0; - stats->txn_commit.v = 0; - stats->txn_rollback.v = 0; + for (i = 0; i < WT_COUNTER_SLOTS; ++i) + __wt_stat_connection_clear_single(stats[i]); +} + +void +__wt_stat_connection_aggregate( + WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to) +{ + to->async_cur_queue += WT_STAT_READ(from, async_cur_queue); + to->async_max_queue += WT_STAT_READ(from, async_max_queue); + to->async_alloc_race += WT_STAT_READ(from, async_alloc_race); + to->async_flush += WT_STAT_READ(from, async_flush); + to->async_alloc_view += WT_STAT_READ(from, async_alloc_view); + to->async_full += WT_STAT_READ(from, async_full); + to->async_nowork += WT_STAT_READ(from, async_nowork); + to->async_op_alloc += WT_STAT_READ(from, async_op_alloc); + to->async_op_compact += WT_STAT_READ(from, async_op_compact); + to->async_op_insert += WT_STAT_READ(from, async_op_insert); + to->async_op_remove += WT_STAT_READ(from, async_op_remove); + to->async_op_search += WT_STAT_READ(from, async_op_search); + to->async_op_update += WT_STAT_READ(from, async_op_update); + to->block_preload += WT_STAT_READ(from, block_preload); + to->block_read += WT_STAT_READ(from, 
block_read); + to->block_write += WT_STAT_READ(from, block_write); + to->block_byte_read += WT_STAT_READ(from, block_byte_read); + to->block_byte_write += WT_STAT_READ(from, block_byte_write); + to->block_map_read += WT_STAT_READ(from, block_map_read); + to->block_byte_map_read += WT_STAT_READ(from, block_byte_map_read); + to->cache_bytes_inuse += WT_STAT_READ(from, cache_bytes_inuse); + to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read); + to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write); + to->cache_eviction_checkpoint += + WT_STAT_READ(from, cache_eviction_checkpoint); + to->cache_eviction_queue_empty += + WT_STAT_READ(from, cache_eviction_queue_empty); + to->cache_eviction_queue_not_empty += + WT_STAT_READ(from, cache_eviction_queue_not_empty); + to->cache_eviction_server_evicting += + WT_STAT_READ(from, cache_eviction_server_evicting); + to->cache_eviction_server_not_evicting += + WT_STAT_READ(from, cache_eviction_server_not_evicting); + to->cache_eviction_slow += WT_STAT_READ(from, cache_eviction_slow); + to->cache_eviction_worker_evicting += + WT_STAT_READ(from, cache_eviction_worker_evicting); + to->cache_eviction_force_fail += + WT_STAT_READ(from, cache_eviction_force_fail); + to->cache_eviction_hazard += + WT_STAT_READ(from, cache_eviction_hazard); + to->cache_inmem_splittable += + WT_STAT_READ(from, cache_inmem_splittable); + to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split); + to->cache_eviction_internal += + WT_STAT_READ(from, cache_eviction_internal); + to->cache_lookaside_insert += + WT_STAT_READ(from, cache_lookaside_insert); + to->cache_lookaside_remove += + WT_STAT_READ(from, cache_lookaside_remove); + to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max); + to->cache_eviction_maximum_page_size += + WT_STAT_READ(from, cache_eviction_maximum_page_size); + to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty); + to->cache_eviction_deepen += + WT_STAT_READ(from, cache_eviction_deepen); + to->cache_write_lookaside += + WT_STAT_READ(from, cache_write_lookaside); + to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse); + to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force); + to->cache_eviction_force_delete += + WT_STAT_READ(from, cache_eviction_force_delete); + to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app); + to->cache_read += WT_STAT_READ(from, cache_read); + to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside); + to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail); + to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split); + to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk); + to->cache_write += WT_STAT_READ(from, cache_write); + to->cache_write_restore += WT_STAT_READ(from, cache_write_restore); + to->cache_overhead += WT_STAT_READ(from, cache_overhead); + to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal); + to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf); + to->cache_bytes_overflow += WT_STAT_READ(from, cache_bytes_overflow); + to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty); + to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty); + to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean); + to->file_open += WT_STAT_READ(from, file_open); + to->memory_allocation += WT_STAT_READ(from, memory_allocation); + to->memory_free += WT_STAT_READ(from, memory_free); + to->memory_grow += WT_STAT_READ(from, memory_grow); + to->cond_wait += WT_STAT_READ(from, 
cond_wait); + to->rwlock_read += WT_STAT_READ(from, rwlock_read); + to->rwlock_write += WT_STAT_READ(from, rwlock_write); + to->read_io += WT_STAT_READ(from, read_io); + to->write_io += WT_STAT_READ(from, write_io); + to->cursor_create += WT_STAT_READ(from, cursor_create); + to->cursor_insert += WT_STAT_READ(from, cursor_insert); + to->cursor_next += WT_STAT_READ(from, cursor_next); + to->cursor_prev += WT_STAT_READ(from, cursor_prev); + to->cursor_remove += WT_STAT_READ(from, cursor_remove); + to->cursor_reset += WT_STAT_READ(from, cursor_reset); + to->cursor_restart += WT_STAT_READ(from, cursor_restart); + to->cursor_search += WT_STAT_READ(from, cursor_search); + to->cursor_search_near += WT_STAT_READ(from, cursor_search_near); + to->cursor_update += WT_STAT_READ(from, cursor_update); + to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count); + to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref); + to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close); + to->dh_sweep_remove += WT_STAT_READ(from, dh_sweep_remove); + to->dh_sweep_tod += WT_STAT_READ(from, dh_sweep_tod); + to->dh_sweeps += WT_STAT_READ(from, dh_sweeps); + to->dh_session_handles += WT_STAT_READ(from, dh_session_handles); + to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps); + to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy); + to->log_slot_closes += WT_STAT_READ(from, log_slot_closes); + to->log_slot_races += WT_STAT_READ(from, log_slot_races); + to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions); + to->log_slot_joins += WT_STAT_READ(from, log_slot_joins); + to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered); + to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload); + to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); + to->log_compress_writes += WT_STAT_READ(from, log_compress_writes); + to->log_compress_write_fails += + WT_STAT_READ(from, log_compress_write_fails); + to->log_compress_small += WT_STAT_READ(from, log_compress_small); + to->log_release_write_lsn += + WT_STAT_READ(from, log_release_write_lsn); + to->log_scans += WT_STAT_READ(from, log_scans); + to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads); + to->log_write_lsn += WT_STAT_READ(from, log_write_lsn); + to->log_sync += WT_STAT_READ(from, log_sync); + to->log_sync_dir += WT_STAT_READ(from, log_sync_dir); + to->log_writes += WT_STAT_READ(from, log_writes); + to->log_slot_consolidated += + WT_STAT_READ(from, log_slot_consolidated); + to->log_max_filesize += WT_STAT_READ(from, log_max_filesize); + to->log_prealloc_max += WT_STAT_READ(from, log_prealloc_max); + to->log_prealloc_files += WT_STAT_READ(from, log_prealloc_files); + to->log_prealloc_used += WT_STAT_READ(from, log_prealloc_used); + to->log_scan_records += WT_STAT_READ(from, log_scan_records); + to->log_compress_mem += WT_STAT_READ(from, log_compress_mem); + to->log_buffer_size += WT_STAT_READ(from, log_buffer_size); + to->log_compress_len += WT_STAT_READ(from, log_compress_len); + to->log_slot_coalesced += WT_STAT_READ(from, log_slot_coalesced); + to->log_close_yields += WT_STAT_READ(from, log_close_yields); + to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app); + to->lsm_work_queue_manager += + WT_STAT_READ(from, lsm_work_queue_manager); + to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged); + to->lsm_checkpoint_throttle += + WT_STAT_READ(from, lsm_checkpoint_throttle); + to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle); + to->lsm_work_queue_switch += + 
WT_STAT_READ(from, lsm_work_queue_switch); + to->lsm_work_units_discarded += + WT_STAT_READ(from, lsm_work_units_discarded); + to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done); + to->lsm_work_units_created += + WT_STAT_READ(from, lsm_work_units_created); + to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max); + to->rec_pages += WT_STAT_READ(from, rec_pages); + to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction); + to->rec_split_stashed_bytes += + WT_STAT_READ(from, rec_split_stashed_bytes); + to->rec_split_stashed_objects += + WT_STAT_READ(from, rec_split_stashed_objects); + to->session_cursor_open += WT_STAT_READ(from, session_cursor_open); + to->session_open += WT_STAT_READ(from, session_open); + to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked); + to->page_forcible_evict_blocked += + WT_STAT_READ(from, page_forcible_evict_blocked); + to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked); + to->page_read_blocked += WT_STAT_READ(from, page_read_blocked); + to->page_sleep += WT_STAT_READ(from, page_sleep); + to->txn_begin += WT_STAT_READ(from, txn_begin); + to->txn_checkpoint_running += + WT_STAT_READ(from, txn_checkpoint_running); + to->txn_checkpoint_generation += + WT_STAT_READ(from, txn_checkpoint_generation); + to->txn_checkpoint_time_max += + WT_STAT_READ(from, txn_checkpoint_time_max); + to->txn_checkpoint_time_min += + WT_STAT_READ(from, txn_checkpoint_time_min); + to->txn_checkpoint_time_recent += + WT_STAT_READ(from, txn_checkpoint_time_recent); + to->txn_checkpoint_time_total += + WT_STAT_READ(from, txn_checkpoint_time_total); + to->txn_checkpoint += WT_STAT_READ(from, txn_checkpoint); + to->txn_fail_cache += WT_STAT_READ(from, txn_fail_cache); + to->txn_pinned_range += WT_STAT_READ(from, txn_pinned_range); + to->txn_pinned_checkpoint_range += + WT_STAT_READ(from, txn_pinned_checkpoint_range); + to->txn_sync += WT_STAT_READ(from, txn_sync); + to->txn_commit += WT_STAT_READ(from, txn_commit); + to->txn_rollback += WT_STAT_READ(from, txn_rollback); } diff --git a/src/txn/txn.c b/src/txn/txn.c index 9e8def39fb0..e81f8a68251 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -134,7 +134,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || - !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); + !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); current_id = snap_min = txn_global->current; prev_oldest_id = txn_global->oldest_id; @@ -147,7 +147,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) /* Check that the oldest ID has not moved in the meantime. */ if (prev_oldest_id == txn_global->oldest_id) { WT_ASSERT(session, txn_global->scan_count > 0); - (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); return; } } @@ -182,12 +182,8 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id); txn_state->snap_min = snap_min; - /* Update the last running ID if we have a much newer value. 
*/ - if (snap_min > txn_global->last_running + 100) - txn_global->last_running = snap_min; - WT_ASSERT(session, txn_global->scan_count > 0); - (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); __txn_sort_snapshot(session, n, current_id); } @@ -212,7 +208,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) WT_SESSION_IMPL *oldest_session; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *s; - uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min; + uint64_t current_id, id, last_running, oldest_id, prev_oldest_id; uint32_t i, session_cnt; int32_t count; int last_running_moved; @@ -220,7 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) conn = S2C(session); txn_global = &conn->txn_global; - current_id = snap_min = txn_global->current; + current_id = last_running = txn_global->current; oldest_session = NULL; prev_oldest_id = txn_global->oldest_id; @@ -241,11 +237,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) if ((count = txn_global->scan_count) < 0) WT_PAUSE(); } while (count < 0 || - !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1)); + !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1)); /* The oldest ID cannot change until the scan count goes to zero. */ prev_oldest_id = txn_global->oldest_id; - current_id = oldest_id = snap_min = txn_global->current; + current_id = oldest_id = last_running = txn_global->current; /* Walk the array of concurrent transactions. */ WT_ORDERED_READ(session_cnt, conn->session_cnt); @@ -260,8 +256,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) */ if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LE(prev_oldest_id, id) && - WT_TXNID_LT(id, snap_min)) - snap_min = id; + WT_TXNID_LT(id, last_running)) + last_running = id; /* * !!! @@ -278,8 +274,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) } } - if (WT_TXNID_LT(snap_min, oldest_id)) - oldest_id = snap_min; + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; /* The oldest ID can't move past any named snapshots. */ if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE && @@ -287,25 +283,25 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) oldest_id = id; /* Update the last running ID. */ - if (WT_TXNID_LT(txn_global->last_running, snap_min)) { - txn_global->last_running = snap_min; - last_running_moved = 1; - } else - last_running_moved = 0; + last_running_moved = + WT_TXNID_LT(txn_global->last_running, last_running); /* Update the oldest ID. */ - if (WT_TXNID_LT(prev_oldest_id, oldest_id) && - WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) { + if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) && + __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) { WT_ORDERED_READ(session_cnt, conn->session_cnt); for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) { if ((id = s->id) != WT_TXN_NONE && - WT_TXNID_LT(id, oldest_id)) - oldest_id = id; + WT_TXNID_LT(id, last_running)) + last_running = id; if ((id = s->snap_min) != WT_TXN_NONE && WT_TXNID_LT(id, oldest_id)) oldest_id = id; } + if (WT_TXNID_LT(last_running, oldest_id)) + oldest_id = last_running; + #ifdef HAVE_DIAGNOSTIC /* * Make sure the ID doesn't move past any named snapshots. 
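A note on the protocol above, since it is easy to misread: threads scanning the transaction array atomically increment scan_count while they walk it, and the thread updating the global IDs may only publish after swapping a count of 1 (itself as the sole scanner) to the sentinel -1, which stalls new scanners until it stores 0 again. Below is a minimal sketch of that handshake using C11 atomics; the names (scan_enter, publish_try) and structure are illustrative, not WiredTiger's implementation.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int scan_count;	/* > 0: active scanners; -1: exclusive updater */

/* Enter a shared scan: wait out any exclusive updater, then count ourselves. */
static void
scan_enter(void)
{
	int count;

	do {
		while ((count = atomic_load(&scan_count)) < 0)
			;	/* an updater holds -1, spin */
	} while (!atomic_compare_exchange_weak(&scan_count, &count, count + 1));
}

/* Leave a shared scan. */
static void
scan_leave(void)
{
	(void)atomic_fetch_sub(&scan_count, 1);
}

/* Publish new global IDs, but only if we are the sole scanner. */
static bool
publish_try(void)
{
	int expected = 1;

	if (!atomic_compare_exchange_strong(&scan_count, &expected, -1))
		return (false);	/* other scanners active: skip the update */
	/* ... re-walk the per-session states and store the new IDs ... */
	atomic_store(&scan_count, 0);	/* release: no scanners remain */
	return (true);
}

The -1 sentinel gives the updater a cheap exclusive section without a separate lock: readers already check the sign of the counter before joining, so the read path needs no extra synchronization.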
@@ -318,8 +314,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) WT_ASSERT(session, id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id)); #endif + if (WT_TXNID_LT(txn_global->last_running, last_running)) + txn_global->last_running = last_running; if (WT_TXNID_LT(txn_global->oldest_id, oldest_id)) txn_global->oldest_id = oldest_id; + WT_ASSERT(session, txn_global->scan_count == -1); txn_global->scan_count = 0; } else { if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) && @@ -334,7 +333,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force) oldest_session->txn.snap_min); } WT_ASSERT(session, txn_global->scan_count > 0); - (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1); + (void)__wt_atomic_subiv32(&txn_global->scan_count, 1); } } @@ -400,7 +399,6 @@ __wt_txn_release(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - int was_oldest; txn = &session->txn; WT_ASSERT(session, txn->mod_count == 0); @@ -408,7 +406,6 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn_global = &S2C(session)->txn_global; txn_state = WT_SESSION_TXN_STATE(session); - was_oldest = 0; /* Clear the transaction's ID from the global table. */ if (WT_SESSION_IS_CHECKPOINT(session)) { @@ -419,12 +416,12 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn_global->checkpoint_id = 0; txn_global->checkpoint_pinned = WT_TXN_NONE; } else if (F_ISSET(txn, WT_TXN_HAS_ID)) { + WT_ASSERT(session, + !WT_TXNID_LT(txn->id, txn_global->last_running)); + WT_ASSERT(session, txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE); WT_PUBLISH(txn_state->id, WT_TXN_NONE); - - /* Quick check for the oldest transaction. */ - was_oldest = (txn->id == txn_global->last_running); txn->id = WT_TXN_NONE; } @@ -443,14 +440,6 @@ __wt_txn_release(WT_SESSION_IMPL *session) txn->isolation = session->isolation; /* Ensure the transaction flags are cleared on exit */ txn->flags = 0; - - /* - * When the oldest transaction in the system completes, bump the oldest - * ID. This is racy and so not guaranteed, but in practice it keeps - * the oldest ID from falling too far behind. - */ - if (was_oldest) - __wt_txn_update_oldest(session, 1); } /* @@ -469,7 +458,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) txn = &session->txn; conn = S2C(session); - WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR)); + WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); if (!F_ISSET(txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "No transaction is active"); @@ -593,6 +582,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) switch (op->type) { case WT_TXN_OP_BASIC: case WT_TXN_OP_INMEM: + WT_ASSERT(session, op->u.upd->txnid == txn->id); op->u.upd->txnid = WT_TXN_ABORTED; break; case WT_TXN_OP_REF: @@ -660,20 +650,29 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session) { WT_TXN_GLOBAL *txn_global; WT_CONNECTION_IMPL *conn; - WT_CONNECTION_STATS *stats; + WT_CONNECTION_STATS **stats; uint64_t checkpoint_pinned; conn = S2C(session); txn_global = &conn->txn_global; - stats = &conn->stats; + stats = conn->stats; checkpoint_pinned = txn_global->checkpoint_pinned; - WT_STAT_SET(stats, txn_pinned_range, - txn_global->current - txn_global->oldest_id); + WT_STAT_SET(session, stats, txn_pinned_range, + txn_global->current - txn_global->oldest_id); - WT_STAT_SET(stats, txn_pinned_checkpoint_range, + WT_STAT_SET(session, stats, txn_pinned_checkpoint_range, checkpoint_pinned == WT_TXN_NONE ? 
0 : txn_global->current - checkpoint_pinned); + + WT_STAT_SET( + session, stats, txn_checkpoint_time_max, conn->ckpt_time_max); + WT_STAT_SET( + session, stats, txn_checkpoint_time_min, conn->ckpt_time_min); + WT_STAT_SET( + session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent); + WT_STAT_SET( + session, stats, txn_checkpoint_time_total, conn->ckpt_time_total); } /* @@ -712,10 +711,11 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_rwlock_alloc(session, &txn_global->nsnap_rwlock, "named snapshot lock")); txn_global->nsnap_oldest_id = WT_TXN_NONE; - STAILQ_INIT(&txn_global->nsnaph); + TAILQ_INIT(&txn_global->nsnaph); WT_RET(__wt_calloc_def( session, conn->session_size, &txn_global->states)); + WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states); for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++) s->id = s->snap_min = WT_TXN_NONE; diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c index 49fcd69ffed..9f59c53314e 100644 --- a/src/txn/txn_ckpt.c +++ b/src/txn/txn_ckpt.c @@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); + /* Skip files that are never involved in a checkpoint. */ + if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) + return (0); + /* Make sure there is space for the next entry. */ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); @@ -285,19 +289,22 @@ static void __checkpoint_stats( WT_SESSION_IMPL *session, struct timespec *start, struct timespec *stop) { + WT_CONNECTION_IMPL *conn; uint64_t msec; + conn = S2C(session); + /* * Get time diff in milliseconds. */ msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION; - if (msec > WT_CONN_STAT(session, txn_checkpoint_time_max)) - WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_max, msec); - if (WT_CONN_STAT(session, txn_checkpoint_time_min) == 0 || - msec < WT_CONN_STAT(session, txn_checkpoint_time_min)) - WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_min, msec); - WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_recent, msec); - WT_STAT_FAST_CONN_INCRV(session, txn_checkpoint_time_total, msec); + + if (msec > conn->ckpt_time_max) + conn->ckpt_time_max = msec; + if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min) + conn->ckpt_time_min = msec; + conn->ckpt_time_recent = msec; + conn->ckpt_time_total += msec; } /* @@ -1161,9 +1168,17 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final) btree = S2BT(session); bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0; - /* If the handle is already dead, force the discard. */ + /* + * If the handle is already dead or the file isn't durable, force the + * discard. + * + * If the file isn't durable, mark the handle dead; there are asserts + * later on that only dead handles can have modified pages. + */ + if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) - return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE)); + return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD)); /* * If closing an unmodified file, check that no update is required diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c index 0d66eccd7dc..a63720d736f 100644 --- a/src/txn/txn_log.c +++ b/src/txn/txn_log.c @@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session, * 3) row store remove; or * 4) row store insert/update.
*/ - if (cbt->btree->type != BTREE_ROW) { - WT_ASSERT(session, cbt->ins != NULL); - recno = WT_INSERT_RECNO(cbt->ins); - WT_ASSERT(session, recno != 0); - - if (WT_UPDATE_DELETED_ISSET(upd)) - WT_ERR(__wt_logop_col_remove_pack(session, logrec, - op->fileid, recno)); - else - WT_ERR(__wt_logop_col_put_pack(session, logrec, - op->fileid, recno, &value)); - } else { + if (cbt->btree->type == BTREE_ROW) { WT_ERR(__wt_cursor_row_leaf_key(cbt, &key)); if (WT_UPDATE_DELETED_ISSET(upd)) @@ -53,6 +42,16 @@ else WT_ERR(__wt_logop_row_put_pack(session, logrec, op->fileid, &key, &value)); + } else { + recno = WT_INSERT_RECNO(cbt->ins); + WT_ASSERT(session, recno != WT_RECNO_OOB); + + if (WT_UPDATE_DELETED_ISSET(upd)) + WT_ERR(__wt_logop_col_remove_pack(session, logrec, + op->fileid, recno)); + else + WT_ERR(__wt_logop_col_put_pack(session, logrec, + op->fileid, recno, &value)); } err: __wt_buf_free(session, &key); @@ -308,7 +307,7 @@ __wt_txn_checkpoint_log( switch (flags) { case WT_TXN_LOG_CKPT_PREPARE: txn->full_ckpt = 1; - *ckpt_lsn = S2C(session)->log->write_start_lsn; + WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn)); /* * We need to make sure that the log records in the checkpoint * LSN are on disk. In particular to make sure that the @@ -337,7 +336,7 @@ __wt_txn_checkpoint_log( txn->ckpt_nsnapshot = 0; WT_CLEAR(empty); ckpt_snapshot = &empty; - *ckpt_lsn = S2C(session)->log->write_start_lsn; + WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn)); } else ckpt_snapshot = txn->ckpt_snapshot; @@ -419,9 +418,9 @@ __wt_txn_truncate_log( } else { op->type = WT_TXN_OP_TRUNCATE_COL; op->u.truncate_col.start = - (start == NULL) ? 0 : start->recno; + (start == NULL) ? WT_RECNO_OOB : start->recno; op->u.truncate_col.stop = - (stop == NULL) ? 0 : stop->recno; + (stop == NULL) ? WT_RECNO_OOB : stop->recno; } /* Write that operation into the in-memory log. */ diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c index bd352c2237e..be736cc1c98 100644 --- a/src/txn/txn_nsnap.c +++ b/src/txn/txn_nsnap.c @@ -34,7 +34,7 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name) txn_global = &S2C(session)->txn_global; - STAILQ_FOREACH(found, &txn_global->nsnaph, q) + TAILQ_FOREACH(found, &txn_global->nsnaph, q) if (WT_STRING_MATCH(found->name, name->str, name->len)) break; @@ -42,10 +42,10 @@ return (WT_NOTFOUND); /* Bump the global ID if we are removing the first entry */ - if (found == STAILQ_FIRST(&txn_global->nsnaph)) - txn_global->nsnap_oldest_id = (STAILQ_NEXT(found, q) != NULL) ? - STAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; - STAILQ_REMOVE(&txn_global->nsnaph, found, __wt_named_snapshot, q); + if (found == TAILQ_FIRST(&txn_global->nsnaph)) + txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ?
+ TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE; + TAILQ_REMOVE(&txn_global->nsnaph, found, q); __nsnap_destroy(session, found); return (ret); @@ -67,7 +67,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive) last = nsnap = prev = NULL; txn_global = &S2C(session)->txn_global; - if (STAILQ_EMPTY(&txn_global->nsnaph)) { + if (TAILQ_EMPTY(&txn_global->nsnaph)) { if (name == NULL) return (0); /* @@ -85,7 +85,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive) */ new_nsnap_oldest = WT_TXN_NONE; if (name != NULL) { - STAILQ_FOREACH(last, &txn_global->nsnaph, q) { + TAILQ_FOREACH(last, &txn_global->nsnaph, q) { if (WT_STRING_MATCH(last->name, name->str, name->len)) break; prev = last; @@ -102,17 +102,17 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive) last = prev; } - if (STAILQ_NEXT(last, q) != NULL) - new_nsnap_oldest = STAILQ_NEXT(last, q)->snap_min; + if (TAILQ_NEXT(last, q) != NULL) + new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min; } do { - nsnap = STAILQ_FIRST(&txn_global->nsnaph); + nsnap = TAILQ_FIRST(&txn_global->nsnaph); WT_ASSERT(session, nsnap != NULL); - STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q); + TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); __nsnap_destroy(session, nsnap); /* Last will be NULL in the all case so it will never match */ - } while (nsnap != last && !STAILQ_EMPTY(&txn_global->nsnaph)); + } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph)); /* Now that the queue of named snapshots is updated, update the ID */ txn_global->nsnap_oldest_id = new_nsnap_oldest; @@ -173,9 +173,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[]) */ WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval)); - if (STAILQ_EMPTY(&txn_global->nsnaph)) + if (TAILQ_EMPTY(&txn_global->nsnaph)) txn_global->nsnap_oldest_id = nsnap_new->snap_min; - STAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); + TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q); nsnap_new = NULL; err: if (started_txn) @@ -254,7 +254,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval) WT_RET(__wt_session_copy_values(session)); WT_RET(__wt_readlock(session, txn_global->nsnap_rwlock)); - STAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) + TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q) if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) { txn->snap_min = txn_state->snap_min = nsnap->snap_min; txn->snap_max = nsnap->snap_max; @@ -358,10 +358,8 @@ __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session) txn_global = &S2C(session)->txn_global; txn_global->nsnap_oldest_id = WT_TXN_NONE; - while (!STAILQ_EMPTY(&txn_global->nsnaph)) { - nsnap = STAILQ_FIRST(&txn_global->nsnaph); - WT_ASSERT(session, nsnap != NULL); - STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q); + while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) { + TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q); __nsnap_destroy(session, nsnap); } diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index 0eadcbf3b01..240d0a5ffd3 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r, "No file found with ID %u (max %u)", id, r->nfiles)); r->missing = 1; - } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) { + } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) { /* * We're going to apply the operation. Get the cursor, opening * one if none is cached. 
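The recovery hunk that follows replaces the literal 0 with WT_RECNO_OOB when decoding a logged column-store truncate: record numbers are 1-based, so 0 is out-of-band and can safely mean "this end of the range is open". Here is a sketch of the decoding idea, with hypothetical names and a stand-in cursor type (not the WiredTiger API):

#include <stdint.h>
#include <stddef.h>

#define RECNO_OOB	0	/* out-of-band record number: no explicit bound */

struct toy_cursor;	/* stand-in for a positioned cursor */

/*
 * Map a logged truncate range onto a (start, stop) cursor pair: an
 * out-of-band recno leaves that cursor NULL, so the truncate runs from
 * the first record and/or through the last record of the table.
 */
static void
truncate_range_setup(struct toy_cursor *c1, struct toy_cursor *c2,
    uint64_t start_recno, uint64_t stop_recno,
    struct toy_cursor **startp, struct toy_cursor **stopp)
{
	if (start_recno == RECNO_OOB) {
		*startp = NULL;		/* open start: from the first record */
		*stopp = c1;
	} else if (stop_recno == RECNO_OOB) {
		*startp = c1;
		*stopp = NULL;		/* open stop: through the last record */
	} else {
		*startp = c1;		/* both ends explicitly positioned */
		*stopp = c2;
	}
}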
@@ -144,10 +144,10 @@ __txn_op_apply( GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ - if (start_recno == 0) { + if (start_recno == WT_RECNO_OOB) { start = NULL; stop = cursor; - } else if (stop_recno == 0) { + } else if (stop_recno == WT_RECNO_OOB) { start = cursor; stop = NULL; } else { @@ -522,7 +522,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); -done: +done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c index 1888c7d967b..1d35f2efc72 100644 --- a/src/utilities/util_list.c +++ b/src/utilities/util_list.c @@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag) } /* - * XXX - * We don't normally say anything about the WiredTiger - * metadata, it's not a normal "object" in the database. I'm - * making an exception for the checkpoint and verbose options. + * !!! + * We don't normally say anything about the WiredTiger metadata + * and lookaside tables; they're not application/user "objects" + * in the database. I'm making an exception for the checkpoint + * and verbose options. */ - if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag) + if (cflag || vflag || + (strcmp(key, WT_METADATA_URI) != 0 && + strcmp(key, WT_LAS_URI) != 0)) printf("%s\n", key); if (!cflag && !vflag) diff --git a/test/checkpoint/checkpointer.c b/test/checkpoint/checkpointer.c index dd6fcd6b95a..c4f36ac69ba 100644 --- a/test/checkpoint/checkpointer.c +++ b/test/checkpoint/checkpointer.c @@ -134,8 +134,7 @@ done: if ((ret = session->close(session, NULL)) != 0) /* * verify_checkpoint -- * Open a cursor on each table at the last checkpoint and walk through - * the tables in parallel. The key/values should match across all - * tables. + * the tables in parallel. The key/values should match across all tables. */ static int verify_checkpoint(WT_SESSION *session) @@ -245,41 +244,36 @@ compare_cursors( WT_CURSOR *cursor2, const char *type2) { uint64_t key1, key2; - char *val1, *val2; - char buf[128]; + char *val1, *val2, buf[128]; + int ret; + ret = 0; memset(buf, 0, 128); if (cursor1->get_key(cursor1, &key1) != 0 || cursor2->get_key(cursor2, &key2) != 0) return (log_print_err("Error getting keys", EINVAL, 1)); - if (key1 != key2) { - printf("Key mismatch %" PRIu64 " from a %s table " - "is not %" PRIu64 " from a %s table\n", - key1, type1, key2, type2); - - return (ERR_KEY_MISMATCH); - } - - /* Now check the values.
*/ if (cursor1->get_value(cursor1, &val1) != 0 || cursor2->get_value(cursor2, &val2) != 0) return (log_print_err("Error getting values", EINVAL, 1)); if (g.logfp != NULL) fprintf(g.logfp, "k1: %" PRIu64 " k2: %" PRIu64 - " val1: %s val2: %s \n", - key1, key2, val1, val2); - if (strlen(val1) != strlen(val2) || - strcmp(val1, val2) != 0) { - printf("Value mismatch for key %" PRIu64 - ", %s from a %s table is not %s from a %s table\n", - key1, val1, type1, val2, type2); - return (ERR_DATA_MISMATCH); - } + " val1: %s val2: %s \n", key1, key2, val1, val2); - return (0); + if (key1 != key2) + ret = ERR_KEY_MISMATCH; + else if (strlen(val1) != strlen(val2) || strcmp(val1, val2) != 0) + ret = ERR_DATA_MISMATCH; + else + return (0); + + printf("Key/value mismatch: %" PRIu64 "/%s from a %s table is not %" + PRIu64 "/%s from a %s table\n", + key1, val1, type1, key2, val2, type2); + + return (ret); } /* @@ -349,10 +343,10 @@ diagnose_key_error( return (1); c->set_key(c, key1_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0); + (void)log_print_err("1st cursor didn't find 1st key", ret, 0); c->set_key(c, key2_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("1st cursor didn't find 2nd key\n", ret, 0); + (void)log_print_err("1st cursor didn't find 2nd key", ret, 0); if (c->close(c) != 0) return (1); @@ -361,10 +355,10 @@ diagnose_key_error( return (1); c->set_key(c, key1_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("2nd cursor didn't find 1st key\n", ret, 0); + (void)log_print_err("2nd cursor didn't find 1st key", ret, 0); c->set_key(c, key2_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0); + (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0); if (c->close(c) != 0) return (1); @@ -378,7 +372,7 @@ live_check: return (1); c->set_key(c, key1_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0); + (void)log_print_err("1st cursor didn't find 1st key", ret, 0); if (c->close(c) != 0) return (1); @@ -387,7 +381,7 @@ live_check: return (1); c->set_key(c, key2_orig); if ((ret = c->search(c)) != 0) - (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0); + (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0); if (c->close(c) != 0) return (1); diff --git a/test/checkpoint/workers.c b/test/checkpoint/workers.c index 5cd2ef4e97b..b8ca5a37d2b 100644 --- a/test/checkpoint/workers.c +++ b/test/checkpoint/workers.c @@ -44,8 +44,7 @@ create_table(WT_SESSION *session, COOKIE *cookie) p = config; end = config + sizeof(config); p += snprintf(p, (size_t)(end - p), - "key_format=%s,value_format=S", - cookie->type == COL ? "r" : "q"); + "key_format=%s,value_format=S", cookie->type == COL ? 
"r" : "q"); if (cookie->type == LSM) (void)snprintf(p, (size_t)(end - p), ",type=lsm"); @@ -133,8 +132,7 @@ worker_op(WT_CURSOR *cursor, uint64_t keyno, u_int new_val) char valuebuf[64]; cursor->set_key(cursor, keyno); - (void)snprintf( - valuebuf, sizeof(valuebuf), "%037u", new_val); + (void)snprintf(valuebuf, sizeof(valuebuf), "%037u", new_val); cursor->set_value(cursor, valuebuf); if ((ret = cursor->insert(cursor)) != 0) { if (ret == WT_ROLLBACK) diff --git a/test/format/backup.c b/test/format/backup.c index 3b95ea92b5e..5805012e1e0 100644 --- a/test/format/backup.c +++ b/test/format/backup.c @@ -65,8 +65,7 @@ copy_file(const char *name) int ret; len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20; - if ((cmd = malloc(len)) == NULL) - die(errno, "malloc"); + cmd = dmalloc(len); (void)snprintf(cmd, len, "cp %s/%s %s/%s", g.home, name, g.home_backup, name); if ((ret = system(cmd)) != 0) diff --git a/test/format/bulk.c b/test/format/bulk.c index 7cf4ba559dc..203043166a4 100644 --- a/test/format/bulk.c +++ b/test/format/bulk.c @@ -39,6 +39,7 @@ wts_load(void) int is_bulk, ret; conn = g.wts_conn; + keybuf = valbuf = NULL; if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) die(ret, "connection.open_session"); diff --git a/test/format/config.c b/test/format/config.c index 6e767a2c6a2..1f19ecf2cd2 100644 --- a/test/format/config.c +++ b/test/format/config.c @@ -36,6 +36,7 @@ static const char *config_file_type(u_int); static CONFIG *config_find(const char *, size_t); static int config_is_perm(const char *); static void config_isolation(void); +static void config_lrt(void); static void config_map_checksum(const char *, u_int *); static void config_map_compression(const char *, u_int *); static void config_map_encryption(const char *, u_int *); @@ -102,8 +103,7 @@ config_setup(void) * our configuration, LSM or KVS devices are "tables", but files are * tested as well. */ - if ((g.uri = malloc(256)) == NULL) - die(errno, "malloc"); + g.uri = dmalloc(256); strcpy(g.uri, DATASOURCE("file") ? "file:" : "table:"); if (DATASOURCE("helium")) strcat(g.uri, "dev1/"); @@ -135,12 +135,6 @@ config_setup(void) if (DATASOURCE("helium") || DATASOURCE("kvsbdb")) g.c_reverse = 0; - config_checksum(); - config_compression("compression"); - config_compression("logging_compression"); - config_encryption(); - config_isolation(); - /* * Periodically, run single-threaded so we can compare the results to * a Berkeley DB copy, as long as the thread-count isn't nailed down. @@ -149,6 +143,13 @@ config_setup(void) if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads")) g.c_threads = 1; + config_checksum(); + config_compression("compression"); + config_compression("logging_compression"); + config_encryption(); + config_isolation(); + config_lrt(); + /* * Periodically, set the delete percentage to 0 so salvage gets run, * as long as the delete percentage isn't nailed down. @@ -329,6 +330,26 @@ config_isolation(void) } /* + * config_lrt -- + * Long-running transaction configuration. + */ +static void +config_lrt(void) +{ + /* + * The underlying engine doesn't support a lookaside file for + * fixed-length column stores. + */ + if (g.type == FIX) { + if (config_is_perm("long_running_txn")) + die(EINVAL, + "long_running_txn not supported with fixed-length " + "column store"); + g.c_long_running_txn = 0; + } +} + +/* * config_error -- * Display configuration information on error. 
*/ diff --git a/test/format/format.h b/test/format/format.h index 4ec2734aee9..d82dea5451f 100644 --- a/test/format/format.h +++ b/test/format/format.h @@ -310,6 +310,8 @@ void config_file(const char *); void config_print(int); void config_setup(void); void config_single(const char *, int); +void *dmalloc(size_t); +char *dstrdup(const char *); void fclose_and_clear(FILE **); void key_gen(uint8_t *, size_t *, uint64_t); void key_gen_insert(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t); @@ -317,6 +319,7 @@ void key_gen_setup(uint8_t **); void key_len_setup(void); void *lrt(void *); void path_setup(const char *); +int read_row(WT_CURSOR *, WT_ITEM *, uint64_t, int); uint32_t rng(WT_RAND_STATE *); void track(const char *, uint64_t, TINFO *); void val_gen(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t); diff --git a/test/format/lrt.c b/test/format/lrt.c index a00a4e07879..85b6e29f224 100644 --- a/test/format/lrt.c +++ b/test/format/lrt.c @@ -37,33 +37,120 @@ lrt(void *arg) { WT_CONNECTION *conn; WT_CURSOR *cursor; + WT_ITEM key, value; WT_SESSION *session; + size_t buf_len, buf_size; + uint64_t keyno, saved_keyno; u_int period; int pinned, ret; + uint8_t bitfield, *keybuf; + void *buf; - (void)(arg); + (void)(arg); /* Unused parameter */ + + saved_keyno = 0; /* [-Werror=maybe-uninitialized] */ + + key_gen_setup(&keybuf); + memset(&key, 0, sizeof(key)); + key.data = keybuf; + memset(&value, 0, sizeof(value)); + + buf = NULL; + buf_len = buf_size = 0; /* Open a session and cursor. */ conn = g.wts_conn; - if ((ret = conn->open_session( - conn, NULL, "isolation=snapshot", &session)) != 0) + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) die(ret, "connection.open_session"); if ((ret = session->open_cursor( session, g.uri, NULL, NULL, &cursor)) != 0) die(ret, "session.open_cursor"); for (pinned = 0;;) { - /* - * If we have an open cursor, reset it, releasing our pin, else - * position the cursor, creating a snapshot. - */ if (pinned) { + /* Re-read the record at the end of the table. */ + while ((ret = read_row(cursor, + &key, saved_keyno, 1)) == WT_ROLLBACK) + ; + if (ret != 0) + die(ret, "read_row %" PRIu64, saved_keyno); + + /* Compare the previous value with the current one. */ + if (g.type == FIX) { + ret = cursor->get_value(cursor, &bitfield); + value.data = &bitfield; + value.size = 1; + } else + ret = cursor->get_value(cursor, &value); + if (ret != 0) + die(ret, + "cursor.get_value: %" PRIu64, saved_keyno); + + if (buf_size != value.size || + memcmp(buf, value.data, value.size) != 0) + die(0, "mismatched start/stop values"); + + /* End the transaction. */ + if ((ret = + session->commit_transaction(session, NULL)) != 0) + die(ret, "session.commit_transaction"); + + /* Reset the cursor, releasing our pin. */ if ((ret = cursor->reset(cursor)) != 0) die(ret, "cursor.reset"); pinned = 0; } else { - if ((ret = cursor->next(cursor)) != 0) - die(ret, "cursor.reset"); + /* + * Begin transaction: without an explicit transaction, + * the snapshot is only kept around while a cursor is + * positioned. As soon as the cursor loses its position + * a new snapshot will be allocated. + */ + if ((ret = session->begin_transaction( + session, "isolation=snapshot")) != 0) + die(ret, "session.begin_transaction"); + + /* Read a record at the end of the table. 
*/ + do { + saved_keyno = mmrand(NULL, + (u_int)(g.key_cnt - g.key_cnt / 10), + (u_int)g.key_cnt); + while ((ret = read_row(cursor, + &key, saved_keyno, 1)) == WT_ROLLBACK) + ; + } while (ret == WT_NOTFOUND); + if (ret != 0) + die(ret, "read_row %" PRIu64, saved_keyno); + + /* Copy the cursor's value. */ + if (g.type == FIX) { + ret = cursor->get_value(cursor, &bitfield); + value.data = &bitfield; + value.size = 1; + } else + ret = cursor->get_value(cursor, &value); + if (ret != 0) + die(ret, + "cursor.get_value: %" PRIu64, saved_keyno); + if (buf_len < value.size && + (buf = realloc(buf, buf_len = value.size)) == NULL) + die(errno, "malloc"); + memcpy(buf, value.data, buf_size = value.size); + + /* + * Move the cursor to an early record in the table, + * hopefully allowing the page with the record just + * retrieved to be evicted from memory. + */ + do { + keyno = mmrand(NULL, 1, (u_int)g.key_cnt / 5); + while ((ret = read_row(cursor, + &key, keyno, 1)) == WT_ROLLBACK) + ; + } while (ret == WT_NOTFOUND); + if (ret != 0) + die(ret, "read_row %" PRIu64, keyno); + pinned = 1; } @@ -82,5 +169,8 @@ lrt(void *arg) if ((ret = session->close(session, NULL)) != 0) die(ret, "session.close"); + free(keybuf); + free(buf); + return (NULL); } diff --git a/test/format/ops.c b/test/format/ops.c index 7d3b22175ca..7c38aec4757 100644 --- a/test/format/ops.c +++ b/test/format/ops.c @@ -33,7 +33,6 @@ static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *); static int col_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); static int nextprev(WT_CURSOR *, int, int *); static void *ops(void *); -static int read_row(WT_CURSOR *, WT_ITEM *, uint64_t); static int row_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *); static int row_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t); @@ -64,6 +63,7 @@ wts_ops(int lastrun) session = NULL; /* -Wconditional-uninitialized */ memset(&backup_tid, 0, sizeof(backup_tid)); memset(&compact_tid, 0, sizeof(compact_tid)); + memset(&lrt_tid, 0, sizeof(lrt_tid)); /* * There are two mechanisms to specify the length of the run, a number @@ -239,13 +239,13 @@ ops(void *arg) tinfo = arg; - /* Initialize the per-thread random number generator. */ - __wt_random_init(&tinfo->rnd); - conn = g.wts_conn; keybuf = valbuf = NULL; readonly = 0; /* -Wconditional-uninitialized */ + /* Initialize the per-thread random number generator. */ + __wt_random_init(&tinfo->rnd); + /* Set up the default key and value buffers. */ key_gen_setup(&keybuf); val_gen_setup(&tinfo->rnd, &valbuf); @@ -475,7 +475,7 @@ skip_insert: if (col_update(tinfo, } } else { ++tinfo->search; - if (read_row(cursor, &key, keyno)) + if (read_row(cursor, &key, keyno, 0)) if (intxn) goto deadlock; continue; @@ -498,7 +498,7 @@ skip_insert: if (col_update(tinfo, /* Read to confirm the operation. */ ++tinfo->search; - if (read_row(cursor, &key, keyno)) + if (read_row(cursor, &key, keyno, 0)) goto deadlock; /* Reset the cursor: there is no reason to keep pages pinned. */ @@ -583,7 +583,7 @@ wts_read_scan(void) } key.data = keybuf; - if ((ret = read_row(cursor, &key, cnt)) != 0) + if ((ret = read_row(cursor, &key, cnt, 0)) != 0) die(ret, "read_scan"); } @@ -597,8 +597,8 @@ wts_read_scan(void) * read_row -- * Read and verify a single element in a row- or column-store file. 
*/ -static int -read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) +int +read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err) { static int sn = 0; WT_ITEM value; @@ -634,19 +634,24 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno) ret = cursor->search(cursor); sn = 1; } - if (ret == 0) { + switch (ret) { + case 0: if (g.type == FIX) { ret = cursor->get_value(cursor, &bitfield); value.data = &bitfield; value.size = 1; - } else { + } else ret = cursor->get_value(cursor, &value); - } - } - if (ret == WT_ROLLBACK) + break; + case WT_ROLLBACK: return (WT_ROLLBACK); - if (ret != 0 && ret != WT_NOTFOUND) + case WT_NOTFOUND: + if (notfound_err) + return (WT_NOTFOUND); + break; + default: die(ret, "read_row: read row %" PRIu64, keyno); + } #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) diff --git a/test/format/smoke.sh b/test/format/smoke.sh index 8b4b5d9e424..5fbc349f242 100755 --- a/test/format/smoke.sh +++ b/test/format/smoke.sh @@ -3,7 +3,7 @@ set -e # Smoke-test format as part of running "make check". -args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none" +args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none logging_compression=none" $TEST_WRAPPER ./t $args file_type=fix $TEST_WRAPPER ./t $args file_type=row diff --git a/test/format/t.c b/test/format/t.c index 8e8a627235f..603706e0ba1 100644 --- a/test/format/t.c +++ b/test/format/t.c @@ -40,7 +40,7 @@ int main(int argc, char *argv[]) { time_t start; - int ch, reps, ret; + int ch, i, onerun, reps, ret; const char *config, *home; config = NULL; @@ -64,11 +64,12 @@ main(int argc, char *argv[]) /* Set values from the command line. */ home = NULL; + onerun = 0; while ((ch = __wt_getopt( g.progname, argc, argv, "1C:c:H:h:Llqrt:")) != EOF) switch (ch) { case '1': /* One run */ - g.c_runs = 1; + onerun = 1; break; case 'C': /* wiredtiger_open config */ g.config_open = __wt_optarg; @@ -105,8 +106,14 @@ main(int argc, char *argv[]) argc -= __wt_optind; argv += __wt_optind; - /* Initialize the global random number generator. */ + /* + * Initialize the global RNG. Start with the standard seeds, and then + * use seconds since the Epoch modulo a prime to run the RNG for some + * number of steps, so we don't start with the same values every time. + */ __wt_random_init(&g.rnd); + for (i = (int)time(NULL) % 10007; i > 0; --i) + (void)__wt_random(&g.rnd); /* Set up paths. */ path_setup(home); @@ -155,6 +162,13 @@ main(int argc, char *argv[]) g.c_runs = 1; /* + * Let the command line -1 flag override runs configured from other + * sources. + */ + if (onerun) + g.c_runs = 1; + + /* * Initialize locks to single-thread named checkpoints and backups, * last-record updates, and failures. */ @@ -298,6 +312,11 @@ die(int e, const char *fmt, ...) /* Single-thread error handling. */ (void)pthread_rwlock_wrlock(&g.death_lock); + /* Try and turn off tracking so it doesn't obscure the error message. */ + if (g.track) { + g.track = 0; + fprintf(stderr, "\n"); + } if (fmt != NULL) { /* Death message.
*/ fprintf(stderr, "%s: ", g.progname); va_start(ap, fmt); diff --git a/test/format/util.c b/test/format/util.c index 9d28b7a81bc..0f4f5de7c20 100644 --- a/test/format/util.c +++ b/test/format/util.c @@ -78,8 +78,7 @@ key_gen_setup(uint8_t **keyp) *keyp = NULL; len = MAX(KILOBYTE(100), g.c_key_max); - if ((key = malloc(len)) == NULL) - die(errno, "malloc"); + key = dmalloc(len); for (i = 0; i < len; ++i) key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]); *keyp = key; @@ -139,8 +138,7 @@ val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp) * data for column-store run-length encoded files. */ len = MAX(KILOBYTE(100), g.c_value_max) + 20; - if ((val = malloc(len)) == NULL) - die(errno, "malloc"); + val = dmalloc(len); for (i = 0; i < len; ++i) val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]); @@ -257,43 +255,36 @@ path_setup(const char *home) size_t len; /* Home directory. */ - if ((g.home = strdup(home == NULL ? "RUNDIR" : home)) == NULL) - die(errno, "malloc"); + g.home = dstrdup(home == NULL ? "RUNDIR" : home); /* Log file. */ len = strlen(g.home) + strlen("log") + 2; - if ((g.home_log = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_log = dmalloc(len); snprintf(g.home_log, len, "%s/%s", g.home, "log"); /* RNG log file. */ len = strlen(g.home) + strlen("rand") + 2; - if ((g.home_rand = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_rand = dmalloc(len); snprintf(g.home_rand, len, "%s/%s", g.home, "rand"); /* Run file. */ len = strlen(g.home) + strlen("CONFIG") + 2; - if ((g.home_config = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_config = dmalloc(len); snprintf(g.home_config, len, "%s/%s", g.home, "CONFIG"); /* Statistics file. */ len = strlen(g.home) + strlen("stats") + 2; - if ((g.home_stats = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_stats = dmalloc(len); snprintf(g.home_stats, len, "%s/%s", g.home, "stats"); /* Backup directory. */ len = strlen(g.home) + strlen("BACKUP") + 2; - if ((g.home_backup = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_backup = dmalloc(len); snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP"); /* BDB directory. */ len = strlen(g.home) + strlen("bdb") + 2; - if ((g.home_bdb = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_bdb = dmalloc(len); snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb"); /* @@ -315,8 +306,7 @@ path_setup(const char *home) "mkdir KVS" #endif len = strlen(g.home) * 3 + strlen(CMD) + 1; - if ((g.home_init = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_init = dmalloc(len); snprintf(g.home_init, len, CMD, g.home, g.home, g.home); /* Backup directory initialize command, remove and re-create it. */ @@ -327,8 +317,7 @@ path_setup(const char *home) #define CMD "rm -rf %s && mkdir %s" #endif len = strlen(g.home_backup) * 2 + strlen(CMD) + 1; - if ((g.home_backup_init = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_backup_init = dmalloc(len); snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup); /* @@ -351,8 +340,7 @@ path_setup(const char *home) "cp WiredTiger* wt* slvg.copy/" #endif len = strlen(g.home) + strlen(CMD) + 1; - if ((g.home_salvage_copy = malloc(len)) == NULL) - die(errno, "malloc"); + g.home_salvage_copy = dmalloc(len); snprintf(g.home_salvage_copy, len, CMD, g.home); } @@ -422,3 +410,31 @@ fclose_and_clear(FILE **fpp) die(errno, "fclose"); return; } + +/* + * dmalloc -- + * Call malloc, dying on failure. 
+ */ +void * +dmalloc(size_t len) +{ + void *p; + + if ((p = malloc(len)) == NULL) + die(errno, "malloc"); + return (p); +} + +/* + * dstrdup -- + * Call strdup, dying on failure. + */ +char * +dstrdup(const char *str) +{ + char *p; + + if ((p = strdup(str)) == NULL) + die(errno, "strdup"); + return (p); +} diff --git a/test/format/wts.c b/test/format/wts.c index 3d3b59810e8..23823c20184 100644 --- a/test/format/wts.c +++ b/test/format/wts.c @@ -462,8 +462,7 @@ wts_dump(const char *tag, int dump_bdb) track("dump files and compare", 0ULL, NULL); len = strlen(g.home) + strlen(BERKELEY_DB_PATH) + strlen(g.uri) + 100; - if ((cmd = malloc(len)) == NULL) - die(errno, "malloc"); + cmd = dmalloc(len); (void)snprintf(cmd, len, "sh s_dumpcmp -h %s %s %s %s %s %s", g.home, @@ -564,9 +563,7 @@ wts_stats(void) /* Data source statistics. */ fprintf(fp, "\n\n====== Data source statistics:\n"); - if ((stat_name = - malloc(strlen("statistics:") + strlen(g.uri) + 1)) == NULL) - die(errno, "malloc"); + stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1); sprintf(stat_name, "statistics:%s", g.uri); if ((ret = session->open_cursor( session, stat_name, NULL, NULL, &cursor)) != 0) diff --git a/test/suite/run.py b/test/suite/run.py index 1cb7309cb53..5e7b76a79b9 100644 --- a/test/suite/run.py +++ b/test/suite/run.py @@ -312,7 +312,7 @@ if __name__ == '__main__': else: for arg in testargs: testsFromArg(tests, loader, arg) - + if debug: import pdb pdb.set_trace() diff --git a/test/suite/test_async01.py b/test/suite/test_async01.py index af5180192af..fee5e8232f1 100644 --- a/test/suite/test_async01.py +++ b/test/suite/test_async01.py @@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback): def notify_error(self, key, value, optype, desc): tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' + str(optype) + '): ' + desc) - + def notify(self, op, op_ret, flags): # Note: we are careful not to throw any errors here. Any diff --git a/test/suite/test_async02.py b/test/suite/test_async02.py index 21d811989c8..c878e8dd114 100644 --- a/test/suite/test_async02.py +++ b/test/suite/test_async02.py @@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback): def notify_error(self, key, value, optype, exp, desc): tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' + str(optype) + '): ' + 'Expected: ' + str(exp) + ' ' + desc) - + def notify(self, op, op_ret, flags): # Note: we are careful not to throw any errors here. Any diff --git a/test/suite/test_autoclose.py b/test/suite/test_autoclose.py index 40106e6f97d..6dc71003a34 100644 --- a/test/suite/test_autoclose.py +++ b/test/suite/test_autoclose.py @@ -156,7 +156,7 @@ class test_autoclose(wttest.WiredTigerTestCase): self.assertRaisesHavingMessage(exceptions.RuntimeError, lambda: self.create_table(), '/wt_session.* is None/') - + def test_close_connection1(self): """ Use a connection handle after it is closed. @@ -166,6 +166,6 @@ class test_autoclose(wttest.WiredTigerTestCase): self.assertRaisesHavingMessage(exceptions.RuntimeError, lambda: conn.open_session(None), '/wt_connection.* is None/') - + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_backup04.py b/test/suite/test_backup04.py index 47e656cf9b1..a0a52f49817 100644 --- a/test/suite/test_backup04.py +++ b/test/suite/test_backup04.py @@ -83,7 +83,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess): # Compare the original and backed-up files using the wt dump command. 
def compare(self, uri, dir_full, dir_incr): - # print "Compare: full URI: " + uri + " with incremental URI " + # print "Compare: full URI: " + uri + " with incremental URI " if dir_full == None: full_name='original' else: diff --git a/test/suite/test_backup05.py b/test/suite/test_backup05.py index 80706b20299..8ab329f761a 100644 --- a/test/suite/test_backup05.py +++ b/test/suite/test_backup05.py @@ -71,7 +71,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): session = self.setUpSessionOpen(conn) session.verify(self.uri) conn.close() - + def test_backup(self): '''Check manual fsyncLock backup strategy''' diff --git a/test/suite/test_base05.py b/test/suite/test_base05.py index 399cba07164..7d5ff59b2c9 100644 --- a/test/suite/test_base05.py +++ b/test/suite/test_base05.py @@ -154,7 +154,7 @@ class test_base05(wttest.WiredTigerTestCase): choice = (n + i) % len(reflist) result += reflist[choice] return result + ':' + str(n) - + def test_table_ss(self): """ Create entries, and read back in a cursor: key=string, value=string @@ -196,7 +196,7 @@ class test_base05(wttest.WiredTigerTestCase): def do_test_table_base(self, convert): """ - Base functionality that uses regular strings with + Base functionality that uses regular strings with non-ASCII (UTF) chars and optionally converts them to Unicode (considered a type separate from string in Python). """ diff --git a/test/suite/test_baseconfig.py b/test/suite/test_baseconfig.py new file mode 100644 index 00000000000..6ac3654af11 --- /dev/null +++ b/test/suite/test_baseconfig.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import os +import wiredtiger, wttest + +# test_baseconfig +# test base configuration file being ignored. 
+class test_baseconfig(wttest.WiredTigerTestCase): + def test_baseconfig(self): + # Open up another database and modify the baseconfig + os.mkdir("A") + conn = wiredtiger.wiredtiger_open("A", 'create') + self.assertTrue(os.path.exists("A/WiredTiger.basecfg")) + with open("A/WiredTiger.basecfg", "a") as basecfg_file: + basecfg_file.write("foo!") + conn.close() + + # Opening the database should now fail because the basecfg is invalid + self.assertRaisesWithMessage( + wiredtiger.WiredTigerError, + lambda: wiredtiger.wiredtiger_open("A", ''), + '/unknown configuration key/') + + conn = wiredtiger.wiredtiger_open("A", "create,config_base=false") + conn.close() + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_bug005.py b/test/suite/test_bug005.py index 961bb551b69..3e06bea8694 100644 --- a/test/suite/test_bug005.py +++ b/test/suite/test_bug005.py @@ -37,7 +37,7 @@ from helper import key_populate, value_populate class test_bug005(wttest.WiredTigerTestCase): # This is a btree layer test, test files, ignore tables. uri = 'file:test_bug005' - + def test_bug005(self): # Create the object. self.session.create(self.uri, 'value_format=S,key_format=S') diff --git a/test/suite/test_bug008.py b/test/suite/test_bug008.py index 0102cbd63f4..75cbd989cd1 100644 --- a/test/suite/test_bug008.py +++ b/test/suite/test_bug008.py @@ -48,7 +48,7 @@ class test_bug008(wttest.WiredTigerTestCase): # Populate the tree and reopen the connection, forcing it to disk # and moving the records to an on-page format. - simple_populate(self, uri, self.fmt, 100) + simple_populate(self, uri, self.fmt, 100) self.reopen_conn() # Begin a transaction, and add some additional records. @@ -105,7 +105,7 @@ class test_bug008(wttest.WiredTigerTestCase): # Populate the tree and reopen the connection, forcing it to disk # and moving the records to an on-page format. - simple_populate(self, uri, self.fmt, 100) + simple_populate(self, uri, self.fmt, 100) self.reopen_conn() # Add some additional visible records. diff --git a/test/suite/test_bug011.py b/test/suite/test_bug011.py index d2c56adb221..b93fc3a75b7 100644 --- a/test/suite/test_bug011.py +++ b/test/suite/test_bug011.py @@ -64,7 +64,7 @@ class test_bug011(wttest.WiredTigerTestCase): # Make sure we have a cursor for the table so it stays in cache. for i in range(0, self.ntables): this_uri = 'table:%s-%03d' % (self.table_name, i) - cursors.append(self.session.open_cursor(this_uri, None)) + cursors.append(self.session.open_cursor(this_uri, None)) # Make use of the cache. for i in range(0, self.nops): diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py index aacc8f1f055..799e6ded1ea 100644 --- a/test/suite/test_checkpoint01.py +++ b/test/suite/test_checkpoint01.py @@ -70,7 +70,7 @@ class test_checkpoint(wttest.WiredTigerTestCase): for checkpoint_name, entry in self.checkpoints.iteritems(): self.add_records(checkpoint_name) self.session.checkpoint("name=" + checkpoint_name) - + # Create a dictionary of sorted records a checkpoint should include.
def list_expected(self, name): records = {} diff --git a/test/suite/test_cursor01.py b/test/suite/test_cursor01.py index 507036e85cf..47cc7f6c5b7 100644 --- a/test/suite/test_cursor01.py +++ b/test/suite/test_cursor01.py @@ -167,7 +167,7 @@ class test_cursor01(wttest.WiredTigerTestCase): def backward_iter(self, cursor): cursor.reset() self.assertCursorHasNoKeyValue(cursor) - + i = self.nentries - 1 while True: prevret = cursor.prev() @@ -188,7 +188,7 @@ class test_cursor01(wttest.WiredTigerTestCase): def backward_iter_with_dup(self, cursor): cursor.reset() self.assertCursorHasNoKeyValue(cursor) - + i = self.nentries - 1 while True: prevret = cursor.prev() diff --git a/test/suite/test_cursor04.py b/test/suite/test_cursor04.py index 50cde0023d8..08f1a7240a5 100644 --- a/test/suite/test_cursor04.py +++ b/test/suite/test_cursor04.py @@ -113,7 +113,7 @@ class test_cursor04(wttest.WiredTigerTestCase): self.assertEqual(direction, 0) self.assertEqual(cursor.get_key(), origkey) self.assertEqual(cursor.get_value(), 0) - + def test_searches(self): """ Create entries, and read back in a cursor: key=string, value=string @@ -174,7 +174,7 @@ class test_cursor04(wttest.WiredTigerTestCase): self.assertEqual(cmp, 0) self.assertEqual(cursor.get_key(), self.genkey(0)) self.assertEqual(cursor.get_value(), 0) - + cursor.set_key(self.genkey(5)) self.expect_either(cursor, 4, 6) diff --git a/test/suite/test_cursor06.py b/test/suite/test_cursor06.py index 28ac581cf66..c11d043a548 100644 --- a/test/suite/test_cursor06.py +++ b/test/suite/test_cursor06.py @@ -58,7 +58,7 @@ class test_cursor06(wttest.WiredTigerTestCase): cursor.set_value(v[0], v[1], v[2], v[3]) else: cursor.set_value(value_populate(cursor, 10)) - + def test_reconfigure_overwrite(self): uri = self.type + self.name for open_config in (None, "overwrite=0", "overwrite=1"): @@ -77,7 +77,7 @@ class test_cursor06(wttest.WiredTigerTestCase): self.set_kv(cursor) cursor.insert() cursor.close() - + def test_reconfigure_readonly(self): uri = self.type + self.name for open_config in (None, "readonly=0", "readonly=1"): diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py index be08c59210f..10a3140a2fd 100644 --- a/test/suite/test_cursor_random.py +++ b/test/suite/test_cursor_random.py @@ -92,7 +92,7 @@ class test_cursor_random(wttest.WiredTigerTestCase): # Check that next_random works in the presence of a larger set of values, # where the values are in a disk format page. - def test_cursor_random_multiple_page_records(self): + def cursor_random_multiple_page_records(self, reopen): uri = self.type + 'random' if self.type == 'file:': simple_populate(self, uri, @@ -103,10 +103,10 @@ class test_cursor_random(wttest.WiredTigerTestCase): 'allocation_size=512,leaf_page_max=512,key_format=' +\ self.fmt, 10000) - # Close the connection so everything is forced to disk (otherwise the - # values are on an insert list and the underlying engine doesn't make - # random selections, it selects the middle of the list. - self.reopen_conn() + # Optionally close the connection so everything is forced to disk; + # insert lists are an entirely different path in the code.
+ if reopen: + self.reopen_conn() cursor = self.session.open_cursor(uri, None, "next_random=true") last = '' @@ -120,6 +120,10 @@ class test_cursor_random(wttest.WiredTigerTestCase): self.assertLess(match, 5, 'next_random did not return random records, too many matches found') + def test_cursor_random_multiple_page_records_reopen(self): + self.cursor_random_multiple_page_records(1) + def test_cursor_random_multiple_page_records(self): + self.cursor_random_multiple_page_records(0) # Check that opening a random cursor on column-store returns not-supported. class test_cursor_random_column(wttest.WiredTigerTestCase): diff --git a/test/suite/test_cursor_tracker.py b/test/suite/test_cursor_tracker.py index 1fa93f3e59b..742dea4c32b 100644 --- a/test/suite/test_cursor_tracker.py +++ b/test/suite/test_cursor_tracker.py @@ -461,7 +461,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase): except: v = '[invalid]' print(prefix + k + ' ' + v) - + def cur_check(self, cursor, got, want, iskey): if got != want: if iskey: diff --git a/test/suite/test_durability01.py b/test/suite/test_durability01.py index 716e38c17d4..8d00d05fa14 100644 --- a/test/suite/test_durability01.py +++ b/test/suite/test_durability01.py @@ -52,7 +52,7 @@ class test_durability01(wttest.WiredTigerTestCase, suite_subprocess): session = self.setUpSessionOpen(conn) session.verify(self.uri) conn.close() - + def test_durability(self): '''Check for missing metadata checkpoints''' diff --git a/test/suite/test_encrypt03.py b/test/suite/test_encrypt03.py index 0e06d4491ca..0e19ad39263 100644 --- a/test/suite/test_encrypt03.py +++ b/test/suite/test_encrypt03.py @@ -86,7 +86,7 @@ class test_encrypt03(wttest.WiredTigerTestCase): def test_encrypt(self): params = 'key_format=S,value_format=S,encryption=(name=' if self.file_encrypt != None: - params += self.file_encrypt + params += self.file_encrypt if self.file_encrypt_args != None: params += ',keyid=' + self.file_encrypt_args params += ')' diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py index ea9bcc5aacb..41fd0f6dd48 100644 --- a/test/suite/test_encrypt04.py +++ b/test/suite/test_encrypt04.py @@ -46,9 +46,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): # with simply the wrong keyid may appear valid when initially verified, # but may result in error on first use. The odds that a real encryptor # would leave a lot of its input unchanged is infinitesimally small. + # + # When both self.forceerror1 and self.forceerror2 occur, we set a config + # flag when loading the rotn encryptor, which forces a particular error + # return in rotn.decrypt. We look for that return back from + # wiredtiger_open. 
encrypt_scen_1 = [ ('none', dict( name1='none', keyid1='', secretkey1='')), - ('rotn17abc', dict( name1='rotn', keyid1='17', secretkey1='ABC')), + ('rotn17abc', dict( name1='rotn', keyid1='17', + secretkey1='ABC', forceerror1=True)), ('rotn11abc', dict( name1='rotn', keyid1='11', secretkey1='ABC')), ('rotn11xyz', dict( name1='rotn', keyid1='11', secretkey1='XYZ')), ('rotn11xyz_and_clear', dict( name1='rotn', keyid1='11', @@ -58,7 +64,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): ('none', dict( name2='none', keyid2='', secretkey2='')), ('rotn17abc', dict( name2='rotn', keyid2='17', secretkey2='ABC')), ('rotn11abc', dict( name2='rotn', keyid2='11', secretkey2='ABC')), - ('rotn11xyz', dict( name2='rotn', keyid2='11', secretkey2='XYZ')), + ('rotn11xyz', dict( name2='rotn', keyid2='11', + secretkey2='XYZ', forceerror2=True)), ('rotn11xyz_and_clear', dict( name2='rotn', keyid2='11', secretkey2='XYZ', fileinclear2=True)) ] @@ -73,6 +80,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): # Override WiredTigerTestCase, we have extensions. def setUpConnectionOpen(self, dir): + forceerror = None if self.part == 1: self.name = self.name1 self.keyid = self.keyid1 @@ -85,15 +93,28 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.secretkey = self.secretkey2 self.fileinclear = self.fileinclear2 if \ hasattr(self, 'fileinclear2') else False + if hasattr(self, 'forceerror1') and hasattr(self, 'forceerror2'): + forceerror = "rotn_force_error=true" + self.expect_forceerror = forceerror != None + self.got_forceerror = False encarg = 'encryption=(name={0},keyid={1},secretkey={2}),'.format( self.name, self.keyid, self.secretkey) - extarg = self.extensionArg([('encryptors', self.name), - ('encryptors', self.name)]) + # If forceerror is set for this test, add a config arg to + # the extension string. That signals rotn to return a (-1000) + # error code, which we'll detect here. + extarg = self.extensionArg([('encryptors', self.name, forceerror)]) self.pr('encarg = ' + encarg + ' extarg = ' + extarg) - conn = wiredtiger.wiredtiger_open(dir, - 'create,error_prefix="{0}: ",{1}{2}'.format( - self.shortid(), encarg, extarg)) + completed = False + try: + conn = wiredtiger.wiredtiger_open(dir, + 'create,error_prefix="{0}: ",{1}{2}'.format( + self.shortid(), encarg, extarg)) + except (BaseException) as err: + # Capture the recognizable error created by rotn + if str(-1000) in str(err): + self.got_forceerror = True + raise self.pr(`conn`) return conn @@ -119,7 +140,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): def extensionArg(self, exts): extfiles = [] for ext in exts: - (dirname, name) = ext + (dirname, name, extarg) = ext if name != None and name != 'none': testdir = os.path.dirname(__file__) extdir = os.path.join(run.wt_builddir, 'ext', dirname) @@ -127,12 +148,16 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): extdir, name, '.libs', 'libwiredtiger_' + name + '.so') if not os.path.exists(extfile): self.skipTest('extension "' + extfile + '" not built') + extfile = '"' + extfile + '"' if not extfile in extfiles: - extfiles.append(extfile) + s = extfile + if extarg != None: + s += "=(config=\"" + extarg + "\")" + extfiles.append(s) if len(extfiles) == 0: return '' else: - return ',extensions=["' + '","'.join(extfiles) + '"]' + return ',extensions=[' + ','.join(extfiles) + ']' # Evaluate expression, which either must succeed (if expect_okay) # or must fail (if !expect_okay). 
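A note on the extensionArg() rework above: each extension path is now quoted individually and may carry its own per-extension config string, which is how the rotn_force_error=true flag reaches the rotn encryptor through wiredtiger_open. A minimal sketch of the string being built (the .so path below is a hypothetical stand-in for the built rotn library, not a path from this change):

    # Sketch of the extensions string built by extensionArg(); the
    # library path is hypothetical, standing in for the built rotn .so.
    def extensions_config(extfile, extarg=None):
        entry = '"' + extfile + '"'
        if extarg is not None:
            # Attach a per-extension configuration string.
            entry += '=(config="' + extarg + '")'
        return ',extensions=[' + entry + ']'

    print(extensions_config(
        'ext/encryptors/rotn/.libs/libwiredtiger_rotn.so',
        'rotn_force_error=true'))
    # ,extensions=["ext/encryptors/rotn/.libs/libwiredtiger_rotn.so"
    #     =(config="rotn_force_error=true")]

Passing the flag through the extension's own config, rather than a global setting, keeps the forced error scoped to the one scenario that loads rotn with both forceerror flags set.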
@@ -204,7 +229,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess): self.check_records(cursor, r, 0, self.nrecords) self.check_records(cursor, r, self.nrecords, self.nrecords * 2) cursor.close() - + self.assertEqual(self.expect_forceerror, self.got_forceerror) + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_encrypt05.py b/test/suite/test_encrypt05.py index f5db543ecf3..8a69e5f909f 100644 --- a/test/suite/test_encrypt05.py +++ b/test/suite/test_encrypt05.py @@ -93,7 +93,7 @@ class test_encrypt05(wttest.WiredTigerTestCase): diff = n - len(self.bigvalue) rchr = ''.join(chr(r.randint(1, 255)) for i in range(diff)) return self.bigvalue + rchr - + # Create a table, add key/values with specific lengths, then verify them. def test_encrypt(self): params = 'key_format=S,value_format=S' diff --git a/test/suite/test_encrypt06.py b/test/suite/test_encrypt06.py index 21e4d50769c..5b2007fe6e7 100644 --- a/test/suite/test_encrypt06.py +++ b/test/suite/test_encrypt06.py @@ -211,7 +211,7 @@ class test_encrypt06(wttest.WiredTigerTestCase): c0.close() c1.close() - + # Force everything to disk so we can examine it self.close_conn() @@ -222,7 +222,7 @@ class test_encrypt06(wttest.WiredTigerTestCase): not self.match_string_in_rundir(txt0)) self.assertEqual(self.expected_encryption(self.encrypt1), not self.match_string_in_rundir(txt1)) - + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py index 0c6b38db3ef..790f651fd2f 100644 --- a/test/suite/test_jsondump02.py +++ b/test/suite/test_jsondump02.py @@ -84,7 +84,7 @@ class test_jsondump02(wttest.WiredTigerTestCase): cursor[insert[0]] = insert[1] finally: cursor.close() - + # Create JSON cursors and test them directly. def test_json_cursor(self): """ @@ -140,50 +140,50 @@ class test_jsondump02(wttest.WiredTigerTestCase): # bad tokens self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('<>abc?', '9'),)), '/unknown token/') # bad tokens self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"abc\u"', ''),)), '/invalid Unicode/') # bad tokens self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"abc', ''),)), '/unterminated string/') # bad syntax self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"stuff" "jibberish"', '"value0" "more jibberish"'),)), '/expected key name.*\"key0\"/') # bad types self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value0" : "xyz",\n"value1" : "str0"'),)), '/expected unsigned JSON <int>, got <string>/') # bad types self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value0" : 123,\n"value1" : 456'),)), '/expected JSON <string>, got <integer>/') # extra stuff self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value0" : 123,\n"value1" : "str0",'),)), '/expected JSON <EOF>, got \',\'/') # fields out of order currently not supported 
self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value1" : "str0",\n"value0" : 123'),)), '/expected value name.*\"value0\"/') @@ -192,17 +192,17 @@ class test_jsondump02(wttest.WiredTigerTestCase): '\\u', '\\ux', '\\u0', '\\u0F', '\\u0FA', '\\u0FAx', '\\u0FA\\x') for uni in invalid_unicode: self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.load_json(self.table_uri2, + lambda: self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value0" : 123,\n"value1" : "' + uni + '"'),)), '/invalid Unicode/') # this one should work - self.load_json(self.table_uri2, + self.load_json(self.table_uri2, (('"key0" : "KEY002"', '"value0" : 345,\n"value1" : "str2"'),)) # extraneous/missing space is okay - self.load_json(self.table_uri2, + self.load_json(self.table_uri2, ((' "key0"\n:\t"KEY003" ', '"value0":456,"value1"\n\n\r\n:\t\n"str3"'),)) diff --git a/test/suite/test_metadata_cursor01.py b/test/suite/test_metadata_cursor01.py index 35fd1a74354..706b8a4132a 100644 --- a/test/suite/test_metadata_cursor01.py +++ b/test/suite/test_metadata_cursor01.py @@ -107,7 +107,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase): self.create_table() cursor = self.session.open_cursor(self.metauri, None, None) self.assertCursorHasNoKeyValue(cursor) - + while True: prevret = cursor.prev() if prevret != 0: @@ -124,7 +124,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase): self.create_table() cursor = self.session.open_cursor(self.metauri, None, None) self.assertCursorHasNoKeyValue(cursor) - + # Ensure the 'special' metadata metadata is found. value = cursor['metadata:'] self.assertTrue(value.find('key_format') != -1) diff --git a/test/suite/test_pack.py b/test/suite/test_pack.py index c9d360c2dcd..451c6fbb9a9 100644 --- a/test/suite/test_pack.py +++ b/test/suite/test_pack.py @@ -43,7 +43,7 @@ class test_pack(wttest.WiredTigerTestCase): y = cursor.get_value() self.tty(' ' + name + ': ' + str(x) + ' => ' + str(y)) cursor.reset() - + def check(self, fmt, *v): v = list(v) fmtname = re.sub('([A-Z])', r'_\1', fmt) diff --git a/test/suite/test_priv01.py b/test/suite/test_priv01.py index 9b6b494e76e..0602d24a2b2 100644 --- a/test/suite/test_priv01.py +++ b/test/suite/test_priv01.py @@ -131,7 +131,7 @@ class test_priv01(wttest.WiredTigerTestCase): lambda: self.common_test(None, edir, None), '/WIREDTIGER_HOME environment variable set but\ process lacks privileges to use that environment variable/') - + def test_env_conf_priv(self): edir = 'envdir' os.mkdir(edir) diff --git a/test/suite/test_schema02.py b/test/suite/test_schema02.py index ab709a28211..0cbff4b5ae0 100644 --- a/test/suite/test_schema02.py +++ b/test/suite/test_schema02.py @@ -173,7 +173,7 @@ class test_schema02(wttest.WiredTigerTestCase): cursor[(i, 'key' + str(i))] = \ ('val' + str(square), square, 'val' + str(cube), cube) cursor.close() - + def check_entries(self): cursor = self.session.open_cursor('table:main', None, None) # spot check via search diff --git a/test/suite/test_schema04.py b/test/suite/test_schema04.py index 9ad01b0f285..a66e1ea2411 100644 --- a/test/suite/test_schema04.py +++ b/test/suite/test_schema04.py @@ -79,7 +79,7 @@ class test_schema04(wttest.WiredTigerTestCase): (i*3)%100, (i*4)%100, (i*5)%100) cursor.insert() cursor.close() - + def check_entries(self): cursor = self.session.open_cursor('table:schema04', None, None) icursor = [] diff --git a/test/suite/test_schema05.py 
b/test/suite/test_schema05.py index c3919af0880..2a7bc042c80 100644 --- a/test/suite/test_schema05.py +++ b/test/suite/test_schema05.py @@ -120,7 +120,7 @@ class test_schema05(wttest.WiredTigerTestCase): cursor[i] = ','.join([str((i*j)%100) for j in range(0, self.nindices)]) cursor.close() - + def check_entries(self): cursor = self.session.open_cursor('table:schema05', None, None) icursor = [] diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py index f5e2aa96cbe..13422a75a61 100644 --- a/test/suite/test_sweep01.py +++ b/test/suite/test_sweep01.py @@ -42,7 +42,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): uri = 'table:' + tablebase numfiles = 50 numkv = 1000 - ckpt = 5 types = [ ('row', dict(tabletype='row', @@ -65,7 +64,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): ',create,error_prefix="%s: ",' % self.shortid() + \ 'file_manager=(close_handle_minimum=0,' + \ 'close_idle_time=6,close_scan_interval=2),' + \ - 'checkpoint=(wait=%d),' % self.ckpt + \ 'statistics=(fast),' # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) try: @@ -93,12 +91,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): time.sleep(1) stat_cursor = self.session.open_cursor('statistics:', None, None) - close1 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close1 = stat_cursor[stat.conn.dh_sweep_close][2] + remove1 = stat_cursor[stat.conn.dh_sweep_remove][2] + sweep1 = stat_cursor[stat.conn.dh_sweeps][2] sclose1 = stat_cursor[stat.conn.dh_session_handles][2] ssweep1 = stat_cursor[stat.conn.dh_session_sweeps][2] - tod1 = stat_cursor[stat.conn.dh_conn_tod][2] - ref1 = stat_cursor[stat.conn.dh_conn_ref][2] + tod1 = stat_cursor[stat.conn.dh_sweep_tod][2] + ref1 = stat_cursor[stat.conn.dh_sweep_ref][2] nfile1 = stat_cursor[stat.conn.file_open][2] stat_cursor.close() @@ -116,10 +115,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # checkpoint something to do. Make sure checkpoint doesn't adjust # the time of death for inactive handles. # + # Note that we do checkpoints inline because that has the side effect + # of sweeping the session cache, which will allow handles to be + # removed. + # c = self.session.open_cursor(uri, None) k = 0 sleep = 0 while sleep < 12: + self.session.checkpoint() k = k+1 c[k] = 1 sleep += 2 @@ -127,13 +131,14 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): c.close() stat_cursor = self.session.open_cursor('statistics:', None, None) - close2 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close2 = stat_cursor[stat.conn.dh_sweep_close][2] + remove2 = stat_cursor[stat.conn.dh_sweep_remove][2] + sweep2 = stat_cursor[stat.conn.dh_sweeps][2] sclose2 = stat_cursor[stat.conn.dh_session_handles][2] ssweep2 = stat_cursor[stat.conn.dh_session_sweeps][2] nfile2 = stat_cursor[stat.conn.file_open][2] - tod2 = stat_cursor[stat.conn.dh_conn_tod][2] - ref2 = stat_cursor[stat.conn.dh_conn_ref][2] + tod2 = stat_cursor[stat.conn.dh_sweep_tod][2] + ref2 = stat_cursor[stat.conn.dh_sweep_ref][2] stat_cursor.close() # print "checkpoint: " + str(self.ckpt) # print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) @@ -144,12 +149,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): # print "tod1: " + str(tod1) + " tod2: " + str(tod2) # print "ref1: " + str(ref1) + " ref2: " + str(ref2) - # + # # The files are all closed. 
Check that sweep did its work even # in the presence of recent checkpoints. # if (close1 >= close2): print "XX: close1: " + str(close1) + " close2: " + str(close2) + print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2) print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2) @@ -157,8 +163,19 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) self.assertEqual(close1 < close2, True) + if (remove1 >= remove2): + print "close1: " + str(close1) + " close2: " + str(close2) + print "XX: remove1: " + str(remove1) + " remove2: " + str(remove2) + print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) + print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2) + print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2) + print "tod1: " + str(tod1) + " tod2: " + str(tod2) + print "ref1: " + str(ref1) + " ref2: " + str(ref2) + print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) + self.assertEqual(remove1 < remove2, True) if (sweep1 >= sweep2): print "close1: " + str(close1) + " close2: " + str(close2) + print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "XX: sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2) print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2) @@ -167,6 +184,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): self.assertEqual(sweep1 < sweep2, True) if (nfile2 >= nfile1): print "close1: " + str(close1) + " close2: " + str(close2) + print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2) print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2) @@ -174,17 +192,18 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess): print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "XX: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) self.assertEqual(nfile2 < nfile1, True) - # The only files that should be left is the metadata, the lock file - # and the active file. - if (nfile2 != 3): + # The only files that should be left are the metadata, the lookaside + # file, the lock file, and the active file. 
+ if (nfile2 != 4): print "close1: " + str(close1) + " close2: " + str(close2) + print "remove1: " + str(remove1) + " remove2: " + str(remove2) print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2) print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2) print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2) print "tod1: " + str(tod1) + " tod2: " + str(tod2) print "ref1: " + str(ref1) + " ref2: " + str(ref2) print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2) - self.assertEqual(nfile2 == 3, True) + self.assertEqual(nfile2 == 4, True) if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_sweep03.py b/test/suite/test_sweep03.py index 4030e2fb715..684c87695c5 100644 --- a/test/suite/test_sweep03.py +++ b/test/suite/test_sweep03.py @@ -93,13 +93,13 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess): time.sleep(5) stat_cursor = self.session.open_cursor('statistics:', None, None) - close1 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close1 = stat_cursor[stat.conn.dh_sweep_close][2] + sweep1 = stat_cursor[stat.conn.dh_sweeps][2] stat_cursor.close() # The sweep server should have run, or the test isn't working. self.assertGreater(sweep1, 0) - # We expect nothing to have been closed, so dh_conn_handles should be 0 + # We expect nothing to have been closed. self.assertEqual(close1, 0) def test_disable_idle_timeout_drop_force(self): @@ -116,7 +116,7 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess): # We just filled the table, now check what the stats are stat_cursor = self.session.open_cursor('statistics:', None, None) cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2] - sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2] + sweep1 = stat_cursor[stat.conn.dh_sweeps][2] stat_cursor.close() # We force the drop in this case to confirm that the handle is closed @@ -127,8 +127,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess): # Grab the stats post table drop to see things have decremented stat_cursor = self.session.open_cursor('statistics:', None, None) cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2] - close2 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close2 = stat_cursor[stat.conn.dh_sweep_close][2] + sweep2 = stat_cursor[stat.conn.dh_sweeps][2] stat_cursor.close() # Make sure the sweep server is still working. 
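The statistic renames running through test_sweep01 and test_sweep03 (dh_conn_handles to dh_sweep_close, dh_conn_sweeps to dh_sweeps, plus the newly checked dh_sweep_remove) all follow the same statistics-cursor read pattern. A minimal sketch of that pattern, assuming a built WiredTiger Python API and a pre-created 'WT_HOME' directory (both assumptions, not part of this diff):

    # Sketch of the statistics-cursor pattern used by the sweep tests;
    # 'WT_HOME' is a hypothetical, already-created database directory.
    from wiredtiger import wiredtiger_open, stat

    conn = wiredtiger_open('WT_HOME', 'create,statistics=(fast)')
    session = conn.open_session()
    stat_cursor = session.open_cursor('statistics:', None, None)
    # Each statistics entry is (description, value-as-string, value);
    # index [2] is the numeric value.
    closes = stat_cursor[stat.conn.dh_sweep_close][2]
    removes = stat_cursor[stat.conn.dh_sweep_remove][2]
    sweeps = stat_cursor[stat.conn.dh_sweeps][2]
    stat_cursor.close()
    conn.close()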
@@ -151,8 +151,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess): # We just filled the table, now check what the stats are stat_cursor = self.session.open_cursor('statistics:', None, None) cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2] - close1 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close1 = stat_cursor[stat.conn.dh_sweep_close][2] + sweep1 = stat_cursor[stat.conn.dh_sweeps][2] stat_cursor.close() self.session.drop(drop_uri, None) @@ -162,8 +162,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess): # Grab the stats post table drop to see things have decremented stat_cursor = self.session.open_cursor('statistics:', None, None) cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2] - close2 = stat_cursor[stat.conn.dh_conn_handles][2] - sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2] + close2 = stat_cursor[stat.conn.dh_sweep_close][2] + sweep2 = stat_cursor[stat.conn.dh_sweeps][2] stat_cursor.close() self.assertGreater(sweep2, sweep1) diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index 5827a892654..83c10f41244 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -217,7 +217,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): for i, ot in enumerate(zip(ops, txns)): ok, txn = ot op, k = ok - + # Close and reopen the connection and cursor. if reopen == 'reopen': self.reopen_conn() diff --git a/test/suite/test_txn03.py b/test/suite/test_txn03.py index 41e283a8050..e2efef1742e 100644 --- a/test/suite/test_txn03.py +++ b/test/suite/test_txn03.py @@ -39,7 +39,7 @@ class test_txn03(wttest.WiredTigerTestCase): uri2 = 'table:' + tablename + "_2" key_str = "TEST_KEY1" data_str1 = "VAL" - data_str2 = "TEST_VAL1" + data_str2 = "TEST_VAL1" nentries = 1000 scenarios = check_scenarios([ diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py index d0a21f5ec9c..f9f660223da 100644 --- a/test/suite/test_txn04.py +++ b/test/suite/test_txn04.py @@ -72,7 +72,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): self.txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] self.backup_dir = os.path.join(self.home, "WT_BACKUP") - # Set archive false on the home directory. + # Set archive false on the home directory. 
conn_params = \ 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ 'create,error_prefix="%s: ",' % self.shortid() + \ @@ -158,7 +158,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): self.session.begin_transaction() ok, txn = ot op, k = ok - + # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn) if op == 'insert' or op == 'update': c[k] = i + 2 @@ -199,7 +199,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): self.hot_backup(self.uri, committed) if txn == 'commit': self.assertEqual(True, self.exception == 'true') - else: + else: self.assertEqual(True, self.exception == 'false') if __name__ == '__main__': diff --git a/test/suite/test_txn05.py b/test/suite/test_txn05.py index 8a2f36fc910..d427b893b17 100644 --- a/test/suite/test_txn05.py +++ b/test/suite/test_txn05.py @@ -181,7 +181,7 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess): ok, txn = ot # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn) op, k = ok - + # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn) if op == 'stop': c.set_key(k) diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py index 8e7119186f5..fa522582a8e 100644 --- a/test/suite/test_txn07.py +++ b/test/suite/test_txn07.py @@ -171,7 +171,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess): ok, txn = ot # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn) op, k = ok - + # print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn) if op == 'stop': c.set_key(k) diff --git a/test/suite/test_txn09.py b/test/suite/test_txn09.py index 98229c52f2e..df085a75d67 100644 --- a/test/suite/test_txn09.py +++ b/test/suite/test_txn09.py @@ -139,7 +139,7 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess): for i, ot in enumerate(zip(ops, txns)): ok, txn = ot op, k = ok - + # Close and reopen the connection and cursor, toggling the log self.log_enabled = not self.log_enabled self.reopen_conn() diff --git a/test/suite/test_txn10.py b/test/suite/test_txn10.py index cee25562756..8810df46777 100644 --- a/test/suite/test_txn10.py +++ b/test/suite/test_txn10.py @@ -62,15 +62,15 @@ class test_txn10(wttest.WiredTigerTestCase, suite_subprocess): self.close_conn() self.conn = self.setUpConnectionOpen(newdir) self.session = self.setUpSessionOpen(self.conn) - + def test_recovery(self): ''' Check for bugs in file ID allocation. ''' # Here's the strategy: - # - Create a table (t1). - # - Do a clean restart. - # - Create another table (t2). - # - Insert data into t2. + # - Create a table (t1). + # - Do a clean restart. + # - Create another table (t2). + # - Insert data into t2. # - Make recovery run. # # If we aren't tracking file IDs properly, it's possible that diff --git a/test/suite/test_txn12.py b/test/suite/test_txn12.py new file mode 100644 index 00000000000..0901811535e --- /dev/null +++ b/test/suite/test_txn12.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. 
We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from suite_subprocess import suite_subprocess +from wiredtiger import stat +from wtscenario import multiply_scenarios, number_scenarios + +# test_txn12.py +# Test commit following a failed op in a read-only transaction. +class test_txn12(wttest.WiredTigerTestCase, suite_subprocess): + name = 'test_txn12' + uri = 'table:' + name + create_params = 'key_format=i,value_format=i' + + # Test that read-only transactions can commit following a failure. + def test_txn12(self): + + # Set up the session and table. + session = self.conn.open_session(None) + session.create(self.uri, self.create_params) + session.begin_transaction("isolation=snapshot") + + # Create a read-only transaction. + c = session.open_cursor(self.uri, None) + c.next() + msg = '/next_random.*boolean/' + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg) + # This commit should succeed as we have done no writes. + session.commit_transaction() + + # Create a read/write transaction.
+ session.begin_transaction("isolation=snapshot") + c = session.open_cursor(self.uri, None) + c[123] = 123 + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg) + # This commit should fail as we have written something + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda:session.commit_transaction(), '/requires rollback/') + +if __name__ == '__main__': + wttest.run() + diff --git a/test/suite/test_util01.py b/test/suite/test_util01.py index 0b1e2a35833..29033fb43ba 100644 --- a/test/suite/test_util01.py +++ b/test/suite/test_util01.py @@ -168,7 +168,7 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess): dumpargs.append("-x") dumpargs.append(self.tablename) self.runWt(dumpargs, outfilename="dump.out") - + self.assertTrue(self.compare_files("expect.out", "dump.out")) def test_dump_process(self): @@ -179,10 +179,10 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess): def test_dump_api(self): self.dump(True, False) - + def test_dump_api_hex(self): self.dump(True, True) - + if __name__ == '__main__': wttest.run() diff --git a/test/suite/wtscenario.py b/test/suite/wtscenario.py index 6e4b0d3464e..0f8e8c30c1f 100644 --- a/test/suite/wtscenario.py +++ b/test/suite/wtscenario.py @@ -61,7 +61,7 @@ def log2chr(val): return chr(ord('0') + p) else: return chr(ord('a') + p - 10) - + megabyte = 1024 * 1024 def check_scenarios(scenes): diff --git a/test/suite/wttest.py b/test/suite/wttest.py index 9f833b0b6a4..443fabb00b2 100644 --- a/test/suite/wttest.py +++ b/test/suite/wttest.py @@ -169,14 +169,14 @@ class WiredTigerTestCase(unittest.TestCase): self.captureerr = CapturedFd('stderr.txt', 'error output') sys.stdout = self.captureout.capture() sys.stderr = self.captureerr.capture() - + def fdTearDown(self): # restore stderr/stdout self.captureout.release() self.captureerr.release() sys.stdout = WiredTigerTestCase._stdout sys.stderr = WiredTigerTestCase._stderr - + def __init__(self, *args, **kwargs): if hasattr(self, 'scenarios'): assert(len(self.scenarios) == len(dict(self.scenarios))) @@ -204,11 +204,11 @@ class WiredTigerTestCase(unittest.TestCase): 'create,error_prefix="%s",%s' % (self.shortid(), self.conn_config)) self.pr(`conn`) return conn - + # Can be overridden def setUpSessionOpen(self, conn): return conn.open_session(None) - + # Can be overridden def close_conn(self): """ @@ -351,7 +351,7 @@ class WiredTigerTestCase(unittest.TestCase): else: with self.expectedStderr(message): self.assertRaises(exceptionType, expr) - + def exceptionToStderr(self, expr): """ Used by assertRaisesHavingMessage to convert an expression diff --git a/test/suite/wtthread.py b/test/suite/wtthread.py index 1e2e4f56380..8959684d6d3 100644 --- a/test/suite/wtthread.py +++ b/test/suite/wtthread.py @@ -35,7 +35,7 @@ class checkpoint_thread(threading.Thread): self.conn = conn self.done = done threading.Thread.__init__(self) - + def run(self): sess = self.conn.open_session() while not self.done.isSet(): @@ -50,7 +50,7 @@ class backup_thread(threading.Thread): self.conn = conn self.done = done threading.Thread.__init__(self) - + def run(self): sess = self.conn.open_session() while not self.done.isSet(): @@ -111,7 +111,7 @@ class op_thread(threading.Thread): self.queue = queue self.done = done threading.Thread.__init__(self) - + def run(self): sess = self.conn.open_session() if (len(self.uris) == 1): diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py index db5b14d6cd6..f2f193c0860 100644 --- 
a/tools/wtstats/stat_data.py +++ b/tools/wtstats/stat_data.py @@ -13,6 +13,7 @@ no_scale_per_second_list = [ 'cache: tracked dirty bytes in the cache', 'cache: tracked dirty pages in the cache', 'connection: files currently open', + 'data-handle: connection data handles currently active', 'log: maximum log file size', 'log: number of pre-allocated log files to create', 'log: total log buffer size', @@ -42,6 +43,7 @@ no_scale_per_second_list = [ 'btree: column-store internal pages', 'btree: column-store variable-size deleted values', 'btree: column-store variable-size leaf pages', + 'btree: column-store variable-size RLE encoded values', 'btree: fixed-record size', 'btree: maximum internal page key size', 'btree: maximum internal page size', @@ -74,6 +76,7 @@ no_clear_list = [ 'cache: tracked dirty bytes in the cache', 'cache: tracked dirty pages in the cache', 'connection: files currently open', + 'data-handle: connection data handles currently active', 'log: maximum log file size', 'log: number of pre-allocated log files to create', 'log: total log buffer size',
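The stat_data.py additions place the new data-handle and RLE statistics in both the no-scale and no-clear lists because they are gauges reporting a current count, not event counters that accumulate between samples. A hypothetical sketch of how a consumer of no_scale_per_second_list might apply it (display_value and its arguments are illustrative, not wtstats code):

    # Hypothetical consumer of the stat_data.py lists above: gauges in
    # no_scale_per_second_list keep their raw value, while other
    # statistics are converted to per-second rates; assumes stat_data
    # is importable from tools/wtstats.
    from stat_data import no_scale_per_second_list

    def display_value(name, value, interval_secs):
        if name in no_scale_per_second_list:
            return float(value)                 # gauge: report as-is
        return float(value) / interval_secs     # counter: per-second rate

    print(display_value(
        'data-handle: connection data handles currently active', 17, 5))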