author     Don Anderson <dda@ddanderson.com>    2015-09-13 21:57:42 -0400
committer  Don Anderson <dda@ddanderson.com>    2015-09-13 21:57:42 -0400
commit     0225351bb6d937309f0bccb800c46e72e1aa4b82 (patch)
tree       95998c9b83fe07fef1642cc281f814e4f25831f0
parent     4c663725867d2f9434298d30883c58a0d96deaa9 (diff)
parent     e1d6886824058b333495236b776b10fcd8fb74ae (diff)
download   mongo-0225351bb6d937309f0bccb800c46e72e1aa4b82.tar.gz
Merge branch 'develop' into index-create-lsm3
-rw-r--r--  SConstruct | 1
-rw-r--r--  bench/wtperf/config.c | 8
-rw-r--r--  bench/wtperf/runners/mongodb-large-oplog.wtperf | 13
-rw-r--r--  bench/wtperf/runners/mongodb-small-oplog.wtperf | 13
-rwxr-xr-x  bench/wtperf/runners/wtperf_run.sh | 14
-rw-r--r--  bench/wtperf/wtperf.c | 36
-rw-r--r--  bench/wtperf/wtperf.h | 10
-rw-r--r--  bench/wtperf/wtperf_truncate.c | 45
-rw-r--r--  build_win/filelist.win | 1
-rw-r--r--  dist/api_data.py | 22
-rw-r--r--  dist/filelist | 1
-rw-r--r--  dist/flags.py | 15
-rwxr-xr-x  dist/s_all | 2
-rwxr-xr-x  dist/s_define | 15
-rw-r--r--  dist/s_define.list | 100
-rwxr-xr-x  dist/s_stat | 2
-rw-r--r--  dist/s_string.ok | 18
-rwxr-xr-x  dist/s_style | 12
-rwxr-xr-x  dist/s_typedef | 2
-rwxr-xr-x  dist/s_whitespace | 23
-rw-r--r--  dist/stat.py | 128
-rw-r--r--  dist/stat_data.py | 102
-rw-r--r--  examples/c/ex_all.c | 3
-rw-r--r--  examples/c/ex_log.c | 31
-rwxr-xr-x  examples/python/ex_access.py | 2
-rwxr-xr-x  examples/python/ex_stat.py | 30
-rw-r--r--  ext/encryptors/rotn/rotn_encrypt.c | 97
-rw-r--r--  ext/extractors/csv/csv_extractor.c | 2
-rw-r--r--  src/async/async_api.c | 29
-rw-r--r--  src/async/async_op.c | 6
-rw-r--r--  src/async/async_worker.c | 16
-rw-r--r--  src/block/block_ext.c | 16
-rw-r--r--  src/block/block_open.c | 32
-rw-r--r--  src/block/block_read.c | 6
-rw-r--r--  src/block/block_slvg.c | 12
-rw-r--r--  src/btree/bt_compact.c | 35
-rw-r--r--  src/btree/bt_cursor.c | 156
-rw-r--r--  src/btree/bt_debug.c | 8
-rw-r--r--  src/btree/bt_delete.c | 28
-rw-r--r--  src/btree/bt_discard.c | 28
-rw-r--r--  src/btree/bt_handle.c | 86
-rw-r--r--  src/btree/bt_io.c | 71
-rw-r--r--  src/btree/bt_ovfl.c | 12
-rw-r--r--  src/btree/bt_page.c | 212
-rw-r--r--  src/btree/bt_read.c | 563
-rw-r--r--  src/btree/bt_slvg.c | 16
-rw-r--r--  src/btree/bt_split.c | 151
-rw-r--r--  src/btree/bt_stat.c | 91
-rw-r--r--  src/btree/bt_sync.c | 1
-rw-r--r--  src/btree/bt_vrfy.c | 8
-rw-r--r--  src/btree/bt_vrfy_dsk.c | 141
-rw-r--r--  src/btree/col_modify.c | 18
-rw-r--r--  src/btree/row_key.c | 5
-rw-r--r--  src/btree/row_modify.c | 9
-rw-r--r--  src/btree/row_srch.c | 15
-rw-r--r--  src/cache/cache_las.c | 391
-rw-r--r--  src/config/config_def.c | 39
-rw-r--r--  src/conn/conn_api.c | 37
-rw-r--r--  src/conn/conn_cache.c | 33
-rw-r--r--  src/conn/conn_cache_pool.c | 219
-rw-r--r--  src/conn/conn_dhandle.c | 19
-rw-r--r--  src/conn/conn_handle.c | 24
-rw-r--r--  src/conn/conn_log.c | 322
-rw-r--r--  src/conn/conn_open.c | 17
-rw-r--r--  src/conn/conn_stat.c | 37
-rw-r--r--  src/conn/conn_sweep.c | 235
-rw-r--r--  src/cursor/cur_backup.c | 8
-rw-r--r--  src/cursor/cur_ds.c | 2
-rw-r--r--  src/cursor/cur_file.c | 15
-rw-r--r--  src/cursor/cur_index.c | 11
-rw-r--r--  src/cursor/cur_log.c | 8
-rw-r--r--  src/cursor/cur_stat.c | 36
-rw-r--r--  src/cursor/cur_std.c | 13
-rw-r--r--  src/docs/cursor-random.dox | 16
-rw-r--r--  src/docs/upgrading.dox | 9
-rw-r--r--  src/evict/evict_file.c | 25
-rw-r--r--  src/evict/evict_lru.c | 456
-rw-r--r--  src/evict/evict_page.c | 81
-rw-r--r--  src/include/async.h | 42
-rw-r--r--  src/include/bitstring.i | 4
-rw-r--r--  src/include/block.h | 4
-rw-r--r--  src/include/btmem.h | 66
-rw-r--r--  src/include/btree.h | 14
-rw-r--r--  src/include/btree.i | 370
-rw-r--r--  src/include/cache.h | 18
-rw-r--r--  src/include/cache.i | 100
-rw-r--r--  src/include/cell.i | 28
-rw-r--r--  src/include/connection.h | 108
-rw-r--r--  src/include/cursor.h | 15
-rw-r--r--  src/include/cursor.i | 12
-rw-r--r--  src/include/dhandle.h | 13
-rw-r--r--  src/include/error.h | 3
-rw-r--r--  src/include/extern.h | 79
-rw-r--r--  src/include/flags.h | 31
-rw-r--r--  src/include/gcc.h | 103
-rw-r--r--  src/include/hardware.h | 26
-rw-r--r--  src/include/lint.h | 99
-rw-r--r--  src/include/log.h | 177
-rw-r--r--  src/include/log.i | 40
-rw-r--r--  src/include/lsm.h | 31
-rw-r--r--  src/include/meta.h | 4
-rw-r--r--  src/include/misc.h | 1
-rw-r--r--  src/include/misc.i | 12
-rw-r--r--  src/include/msvc.h | 92
-rw-r--r--  src/include/mutex.h | 29
-rw-r--r--  src/include/mutex.i | 14
-rw-r--r--  src/include/os.h | 6
-rw-r--r--  src/include/queue.h | 345
-rw-r--r--  src/include/schema.h | 4
-rw-r--r--  src/include/serial.i | 167
-rw-r--r--  src/include/session.h | 17
-rw-r--r--  src/include/stat.h | 704
-rw-r--r--  src/include/txn.h | 7
-rw-r--r--  src/include/txn.i | 52
-rw-r--r--  src/include/wiredtiger.in | 385
-rw-r--r--  src/include/wt_internal.h | 25
-rw-r--r--  src/log/log.c | 598
-rw-r--r--  src/log/log_slot.c | 579
-rw-r--r--  src/lsm/lsm_cursor.c | 26
-rw-r--r--  src/lsm/lsm_manager.c | 22
-rw-r--r--  src/lsm/lsm_merge.c | 12
-rw-r--r--  src/lsm/lsm_stat.c | 63
-rw-r--r--  src/lsm/lsm_tree.c | 41
-rw-r--r--  src/lsm/lsm_work_unit.c | 30
-rw-r--r--  src/lsm/lsm_worker.c | 2
-rw-r--r--  src/meta/meta_apply.c | 2
-rw-r--r--  src/meta/meta_table.c | 33
-rw-r--r--  src/os_posix/os_alloc.c | 12
-rw-r--r--  src/os_posix/os_mtx_cond.c | 19
-rw-r--r--  src/os_posix/os_mtx_rw.c | 199
-rw-r--r--  src/os_posix/os_open.c | 10
-rw-r--r--  src/os_posix/os_path.c | 4
-rw-r--r--  src/os_posix/os_remove.c | 2
-rw-r--r--  src/os_posix/os_thread.c | 6
-rw-r--r--  src/os_win/os_errno.c | 4
-rw-r--r--  src/os_win/os_mtx_cond.c | 35
-rw-r--r--  src/os_win/os_open.c | 8
-rw-r--r--  src/os_win/os_path.c | 4
-rw-r--r--  src/os_win/os_remove.c | 2
-rw-r--r--  src/reconcile/rec_write.c | 1187
-rw-r--r--  src/schema/schema_list.c | 12
-rw-r--r--  src/schema/schema_stat.c | 4
-rw-r--r--  src/session/session_api.c | 59
-rw-r--r--  src/session/session_dhandle.c | 137
-rw-r--r--  src/support/pow.c | 2
-rw-r--r--  src/support/rand.c | 7
-rw-r--r--  src/support/stat.c | 1630
-rw-r--r--  src/txn/txn.c | 90
-rw-r--r--  src/txn/txn_ckpt.c | 33
-rw-r--r--  src/txn/txn_log.c | 31
-rw-r--r--  src/txn/txn_nsnap.c | 36
-rw-r--r--  src/txn/txn_recover.c | 8
-rw-r--r--  src/utilities/util_list.c | 13
-rw-r--r--  test/checkpoint/checkpointer.c | 52
-rw-r--r--  test/checkpoint/workers.c | 6
-rw-r--r--  test/format/backup.c | 3
-rw-r--r--  test/format/bulk.c | 1
-rw-r--r--  test/format/config.c | 37
-rw-r--r--  test/format/format.h | 3
-rw-r--r--  test/format/lrt.c | 108
-rw-r--r--  test/format/ops.c | 35
-rwxr-xr-x  test/format/smoke.sh | 2
-rw-r--r--  test/format/t.c | 25
-rw-r--r--  test/format/util.c | 64
-rw-r--r--  test/format/wts.c | 7
-rw-r--r--  test/suite/run.py | 2
-rw-r--r--  test/suite/test_async01.py | 2
-rw-r--r--  test/suite/test_async02.py | 2
-rw-r--r--  test/suite/test_autoclose.py | 4
-rw-r--r--  test/suite/test_backup04.py | 2
-rw-r--r--  test/suite/test_backup05.py | 2
-rw-r--r--  test/suite/test_base05.py | 4
-rw-r--r--  test/suite/test_baseconfig.py | 54
-rw-r--r--  test/suite/test_bug005.py | 2
-rw-r--r--  test/suite/test_bug008.py | 4
-rw-r--r--  test/suite/test_bug011.py | 2
-rw-r--r--  test/suite/test_checkpoint01.py | 2
-rw-r--r--  test/suite/test_cursor01.py | 4
-rw-r--r--  test/suite/test_cursor04.py | 4
-rw-r--r--  test/suite/test_cursor06.py | 4
-rw-r--r--  test/suite/test_cursor_random.py | 14
-rw-r--r--  test/suite/test_cursor_tracker.py | 2
-rw-r--r--  test/suite/test_durability01.py | 2
-rw-r--r--  test/suite/test_encrypt03.py | 2
-rw-r--r--  test/suite/test_encrypt04.py | 48
-rw-r--r--  test/suite/test_encrypt05.py | 2
-rw-r--r--  test/suite/test_encrypt06.py | 4
-rw-r--r--  test/suite/test_jsondump02.py | 24
-rw-r--r--  test/suite/test_metadata_cursor01.py | 4
-rw-r--r--  test/suite/test_pack.py | 2
-rw-r--r--  test/suite/test_priv01.py | 2
-rw-r--r--  test/suite/test_schema02.py | 2
-rw-r--r--  test/suite/test_schema04.py | 2
-rw-r--r--  test/suite/test_schema05.py | 2
-rw-r--r--  test/suite/test_sweep01.py | 49
-rw-r--r--  test/suite/test_sweep03.py | 20
-rw-r--r--  test/suite/test_txn02.py | 2
-rw-r--r--  test/suite/test_txn03.py | 2
-rw-r--r--  test/suite/test_txn04.py | 6
-rw-r--r--  test/suite/test_txn05.py | 2
-rw-r--r--  test/suite/test_txn07.py | 2
-rw-r--r--  test/suite/test_txn09.py | 2
-rw-r--r--  test/suite/test_txn10.py | 10
-rw-r--r--  test/suite/test_txn12.py | 70
-rw-r--r--  test/suite/test_util01.py | 6
-rw-r--r--  test/suite/wtscenario.py | 2
-rw-r--r--  test/suite/wttest.py | 10
-rw-r--r--  test/suite/wtthread.py | 6
-rw-r--r--  tools/wtstats/stat_data.py | 3
209 files changed, 8245 insertions, 5277 deletions
diff --git a/SConstruct b/SConstruct
index 49e4417133f..70ed6e0220b 100644
--- a/SConstruct
+++ b/SConstruct
@@ -454,6 +454,7 @@ t = env.Program("wtperf", [
"bench/wtperf/misc.c",
"bench/wtperf/track.c",
"bench/wtperf/wtperf.c",
+ "bench/wtperf/wtperf_truncate.c",
],
LIBS=[wtlib, shim] + wtlibs)
Default(t)
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index 4445de3296d..6b0ce47ef3f 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -96,7 +96,7 @@ config_assign(CONFIG *dest, const CONFIG *src)
}
}
- STAILQ_INIT(&dest->stone_head);
+ TAILQ_INIT(&dest->stone_head);
return (0);
}
@@ -257,13 +257,15 @@ config_threads(CONFIG *cfg, const char *config, size_t len)
continue;
}
if (STRING_MATCH("truncate_pct", k.str, k.len)) {
- if ((workp->truncate_pct = v.val) <= 0)
+ if (v.val <= 0)
goto err;
+ workp->truncate_pct = (uint64_t)v.val;
continue;
}
if (STRING_MATCH("truncate_count", k.str, k.len)) {
- if ((workp->truncate_count = v.val) <= 0)
+ if (v.val <= 0)
goto err;
+ workp->truncate_count = (uint64_t)v.val;
continue;
}
	goto err;
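
Both hunks above follow the same validate-then-assign pattern: the parser's signed value is range-checked before it is stored into the unsigned workload field, so a negative truncate_pct or truncate_count can no longer wrap around to a huge uint64_t. A minimal sketch of the pattern (the function name is illustrative, not part of wtperf):

    #include <stdint.h>

    /* Reject non-positive input before storing it into an unsigned field. */
    static int
    set_positive_u64(int64_t val, uint64_t *out)
    {
        if (val <= 0)
            return (1);        /* caller takes its error path */
        *out = (uint64_t)val;  /* safe: val is known positive here */
        return (0);
    }
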
diff --git a/bench/wtperf/runners/mongodb-large-oplog.wtperf b/bench/wtperf/runners/mongodb-large-oplog.wtperf
new file mode 100644
index 00000000000..1e203a34cc3
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-large-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates a test database of 7.8GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=300000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=8192
+# Set up three threads to insert into the oplog
+# Set up one thread to do truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=300000))
diff --git a/bench/wtperf/runners/mongodb-small-oplog.wtperf b/bench/wtperf/runners/mongodb-small-oplog.wtperf
new file mode 100644
index 00000000000..4f2ae5359cd
--- /dev/null
+++ b/bench/wtperf/runners/mongodb-small-oplog.wtperf
@@ -0,0 +1,13 @@
+# wtperf options file to simulate populating a MongoDB oplog
+# This creates an oplog of 6.1GB
+conn_config="cache_size=2GB,checkpoint=(wait=60)"
+table_config="type=file"
+# Start with a small set of inserts in the populate phase.
+icount=750000
+report_interval=5
+run_time=3600
+populate_threads=1
+key_sz=512
+# Set up three threads to insert into the oplog
+# Set up one thread to do truncates from the oplog
+threads=((count=3,inserts=1,throttle=2000),(count=1,truncate=1,truncate_pct=10,truncate_count=750000))
diff --git a/bench/wtperf/runners/wtperf_run.sh b/bench/wtperf/runners/wtperf_run.sh
index d5de7c4abdb..ac31c2a2e78 100755
--- a/bench/wtperf/runners/wtperf_run.sh
+++ b/bench/wtperf/runners/wtperf_run.sh
@@ -24,18 +24,18 @@ outfile=./wtperf.out
rm -f $outfile
# Each of these has an entry for each op in ops below.
-avg=(0 0 0)
-max=(0 0 0)
-min=(0 0 0)
-sum=(0 0 0)
+avg=(0 0 0 0)
+max=(0 0 0 0)
+min=(0 0 0 0)
+sum=(0 0 0 0)
# Load needs floating point and bc, handle separately.
-loadindex=4
+loadindex=5
avg[$loadindex]=0
max[$loadindex]=0
min[$loadindex]=0
sum[$loadindex]=0
-ops=(read insert update)
-outp=("Read count:" "Insert count:" "Update count:")
+ops=(read insert update truncate)
+outp=("Read count:" "Insert count:" "Update count:" "Truncate count:")
outp[$loadindex]="Load time:"
# getval min/max val cur
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 148aa0e4e84..5d3b334785d 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -96,17 +96,11 @@ static uint64_t wtperf_value_range(CONFIG *);
#define HELIUM_CONFIG ",type=helium"
#define INDEX_COL_NAMES ",columns=(key,val)"
-inline uint64_t
-decode_key(char *key_buf)
-{
- return (strtoull(key_buf, NULL, 10));
-}
-
/* Retrieve an ID for the next insert operation. */
static inline uint64_t
get_next_incr(CONFIG *cfg)
{
- return (WT_ATOMIC_ADD8(cfg->insert_key, 1));
+ return (__wt_atomic_add64(&cfg->insert_key, 1));
}
static void
@@ -157,7 +151,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
switch (type) {
case WT_AOP_COMPACT:
tables = (uint32_t *)op->app_private;
- WT_ATOMIC_ADD4(*tables, (uint32_t)-1);
+ (void)__wt_atomic_add32(tables, (uint32_t)-1);
break;
case WT_AOP_INSERT:
trk = &thread->insert;
@@ -192,7 +186,7 @@ cb_asyncop(WT_ASYNC_CALLBACK *cb, WT_ASYNC_OP *op, int ret, uint32_t flags)
return (0);
if (ret == 0 || (ret == WT_NOTFOUND && type != WT_AOP_INSERT)) {
if (!cfg->in_warmup)
- (void)WT_ATOMIC_ADD8(trk->ops, 1);
+ (void)__wt_atomic_add64(&trk->ops, 1);
return (0);
}
err:
@@ -513,10 +507,9 @@ worker(void *arg)
* is 0, to avoid first time latency spikes.
*/
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 && trk != NULL &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -880,10 +873,9 @@ populate_thread(void *arg)
cursor = cursors[op % cfg->table_count];
generate_key(cfg, key_buf, op);
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -1001,10 +993,9 @@ populate_async(void *arg)
* the time to process by workers.
*/
measure_latency =
- cfg->sample_interval != 0 && trk->ops != 0 && (
- trk->ops % cfg->sample_rate == 0);
- if (measure_latency &&
- (ret = __wt_epoch(NULL, &start)) != 0) {
+ cfg->sample_interval != 0 &&
+ trk->ops != 0 && (trk->ops % cfg->sample_rate == 0);
+ if (measure_latency && (ret = __wt_epoch(NULL, &start)) != 0) {
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
@@ -1046,8 +1037,7 @@ populate_async(void *arg)
goto err;
if (measure_latency) {
if ((ret = __wt_epoch(NULL, &stop)) != 0) {
- lprintf(cfg, ret, 0,
- "Get time call failed");
+ lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
++trk->latency_ops;
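
The atomic changes in this file track a WiredTiger API rename: the WT_ATOMIC_ADD8/WT_ATOMIC_ADD4 macros (suffixed with the operand size in bytes) become the __wt_atomic_add64/__wt_atomic_add32 functions, which take a pointer to the target rather than the lvalue itself. A rough stand-in in portable C11, assuming the add-and-return-new-value semantics that get_next_incr() relies on:

    #include <stdatomic.h>
    #include <stdint.h>

    /* Approximation of __wt_atomic_add64: add v, return the new value. */
    static inline uint64_t
    atomic_add64(_Atomic uint64_t *vp, uint64_t v)
    {
        /* atomic_fetch_add returns the old value, so add v back. */
        return (atomic_fetch_add(vp, v) + v);
    }
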
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index 991c09138e3..e4b9fc00798 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -95,7 +95,7 @@ typedef struct {
int64_t ops_per_txn;
int64_t truncate; /* Truncate ratio */
uint64_t truncate_pct; /* Truncate Percent */
- uint64_t truncate_count; /* Truncate Percent */
+ uint64_t truncate_count; /* Truncate Count */
#define WORKER_INSERT 1 /* Insert */
#define WORKER_INSERT_RMW 2 /* Insert with read-modify-write */
@@ -108,7 +108,6 @@ typedef struct {
/* Steering items for the truncate workload */
typedef struct __truncate_struct TRUNCATE_CONFIG;
struct __truncate_struct {
- double truncation_percentage;
uint64_t stone_gap;
uint64_t needed_stones;
uint64_t final_stone_gap;
@@ -122,8 +121,8 @@ struct __truncate_struct {
/* Queue entry for use with the Truncate Logic */
struct __truncate_queue_entry {
char *key; /* Truncation point */
- u_int diff; /* Number of items to be truncated*/
- STAILQ_ENTRY(__truncate_queue_entry) q;
+	uint64_t diff; /* Number of items to be truncated */
+ TAILQ_ENTRY(__truncate_queue_entry) q;
};
typedef struct __truncate_queue_entry TRUNCATE_QUEUE_ENTRY;
@@ -179,7 +178,7 @@ struct __config { /* Configuration structure */
u_int has_truncate; /* if there is a truncate workload */
/* Queue head for use with the Truncate Logic */
- STAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
+ TAILQ_HEAD(__truncate_qh, __truncate_queue_entry) stone_head;
/* Fields changeable on command line are listed in wtperf_opt.i */
#define OPT_DECLARE_STRUCT
@@ -273,7 +272,6 @@ int config_opt_line(CONFIG *, const char *);
int config_opt_str(CONFIG *, const char *, const char *);
void config_print(CONFIG *);
int config_sanity(CONFIG *);
-uint64_t decode_key(char *);
void latency_insert(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
void latency_read(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
void latency_update(CONFIG *, uint32_t *, uint32_t *, uint32_t *);
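
The queue changes above move the truncate stone list from a singly-linked STAILQ to a doubly-linked TAILQ; a later hunk in this merge adds an s_style check requiring TAILQ for all lists. A self-contained sketch of the <sys/queue.h> TAILQ pattern the header now uses:

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct entry {
        int value;
        TAILQ_ENTRY(entry) q;    /* forward and backward links */
    };
    TAILQ_HEAD(entry_qh, entry);

    int
    main(void)
    {
        struct entry_qh head;
        struct entry *e;
        int i;

        TAILQ_INIT(&head);
        for (i = 0; i < 3; ++i) {
            if ((e = calloc(1, sizeof(*e))) == NULL)
                return (1);
            e->value = i;
            TAILQ_INSERT_TAIL(&head, e, q);
        }
        /* Unlike STAILQ, TAILQ_REMOVE is O(1) for any element. */
        while ((e = TAILQ_FIRST(&head)) != NULL) {
            TAILQ_REMOVE(&head, e, q);
            printf("%d\n", e->value);
            free(e);
        }
        return (0);
    }
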
diff --git a/bench/wtperf/wtperf_truncate.c b/bench/wtperf/wtperf_truncate.c
index 0cdbbb914a4..581d1987947 100644
--- a/bench/wtperf/wtperf_truncate.c
+++ b/bench/wtperf/wtperf_truncate.c
@@ -28,6 +28,12 @@
#include "wtperf.h"
+static inline uint64_t
+decode_key(char *key_buf)
+{
+ return (strtoull(key_buf, NULL, 10));
+}
+
int
setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
@@ -37,8 +43,7 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
WT_CURSOR *cursor;
char *key, *truncate_key;
int ret;
- size_t i;
- uint64_t end_point, final_stone_gap, start_point;
+ uint64_t end_point, final_stone_gap, i, start_point;
end_point = final_stone_gap = start_point = 0;
trunc_cfg = &thread->trunc_cfg;
@@ -49,11 +54,9 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
session, cfg->uris[0], NULL, NULL, &cursor)) != 0)
goto err;
- /* Truncation percentage value. eg 10% is 0.1. */
- trunc_cfg->truncation_percentage = (double)workload->truncate_pct / 100;
/* How many entries between each stone. */
trunc_cfg->stone_gap =
- workload->truncate_count * trunc_cfg->truncation_percentage;
+ (workload->truncate_count * workload->truncate_pct) / 100;
/* How many stones we need. */
trunc_cfg->needed_stones =
workload->truncate_count / trunc_cfg->stone_gap;
@@ -94,8 +97,13 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
trunc_cfg->expected_total = (end_point - start_point);
for (i = 1; i <= trunc_cfg->needed_stones; i++) {
truncate_key = calloc(cfg->key_sz, 1);
+ if (truncate_key == NULL) {
+ ret = enomem(cfg);
+ goto err;
+ }
truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
if (truncate_item == NULL) {
+ free(truncate_key);
ret = enomem(cfg);
goto err;
}
@@ -104,14 +112,16 @@ setup_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_SESSION *session) {
truncate_item->key = truncate_key;
truncate_item->diff =
(trunc_cfg->stone_gap * i) - trunc_cfg->last_key;
- STAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
+ TAILQ_INSERT_TAIL( &cfg->stone_head, truncate_item, q);
trunc_cfg->last_key = trunc_cfg->stone_gap * i;
trunc_cfg->num_stones++;
}
}
trunc_cfg->stone_gap = final_stone_gap;
-err: cursor->close(cursor);
+err: if ((ret = cursor->close(cursor)) != 0) {
+ lprintf(cfg, ret, 0, "truncate setup: cursor close failed");
+ }
return (ret);
}
@@ -141,16 +151,22 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
trunc_cfg->last_key += trunc_cfg->stone_gap;
truncate_key = calloc(cfg->key_sz, 1);
+ if (truncate_key == NULL) {
+ lprintf(cfg, ENOMEM, 0,
+ "truncate: couldn't allocate key array");
+ return (ENOMEM);
+ }
truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
if (truncate_item == NULL) {
+ free(truncate_key);
lprintf(cfg, ENOMEM, 0,
- "worker: couldn't allocate cursor array");
+ "truncate: couldn't allocate item");
return (ENOMEM);
}
generate_key(cfg, truncate_key, trunc_cfg->last_key);
truncate_item->key = truncate_key;
truncate_item->diff = trunc_cfg->stone_gap;
- STAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
+ TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
trunc_cfg->num_stones++;
}
@@ -159,9 +175,9 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
trunc_cfg->expected_total <= thread->workload->truncate_count)
return (0);
- truncate_item = STAILQ_FIRST(&cfg->stone_head);
+ truncate_item = TAILQ_FIRST(&cfg->stone_head);
trunc_cfg->num_stones--;
- STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+ TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
cursor->set_key(cursor,truncate_item->key);
if ((ret = cursor->search(cursor)) != 0) {
lprintf(cfg, ret, 0, "Truncate search: failed");
@@ -179,7 +195,6 @@ run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
err: free(truncate_item->key);
free(truncate_item);
- truncate_item = NULL;
t_ret = cursor->reset(cursor);
if (t_ret != 0)
lprintf(cfg, t_ret, 0, "Cursor reset failed");
@@ -192,9 +207,9 @@ void
cleanup_truncate_config(CONFIG *cfg) {
TRUNCATE_QUEUE_ENTRY *truncate_item;
- while (!STAILQ_EMPTY(&cfg->stone_head)) {
- truncate_item = STAILQ_FIRST(&cfg->stone_head);
- STAILQ_REMOVE_HEAD(&cfg->stone_head, q);
+ while (!TAILQ_EMPTY(&cfg->stone_head)) {
+ truncate_item = TAILQ_FIRST(&cfg->stone_head);
+ TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
free(truncate_item->key);
free(truncate_item);
}
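
The stone_gap computation in this file now multiplies before dividing in integer arithmetic rather than rounding through a floating-point percentage. Plugging in the values from the bundled mongodb-large-oplog.wtperf runner (truncate_count=300000, truncate_pct=10) as a quick check of the math:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint64_t truncate_count = 300000, truncate_pct = 10;
        uint64_t stone_gap, needed_stones;

        /* Multiply before dividing so small percentages don't truncate to 0. */
        stone_gap = (truncate_count * truncate_pct) / 100;    /* 30000 */
        needed_stones = truncate_count / stone_gap;           /* 10 */
        printf("stone_gap=%" PRIu64 ", needed_stones=%" PRIu64 "\n",
            stone_gap, needed_stones);
        return (0);
    }
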
diff --git a/build_win/filelist.win b/build_win/filelist.win
index 099451e418d..9d0ee10d305 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -45,6 +45,7 @@ src/btree/col_srch.c
src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
+src/cache/cache_las.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
diff --git a/dist/api_data.py b/dist/api_data.py
index 43b585a6c6d..3a700cf886b 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -449,13 +449,17 @@ connection_runtime_config = [
Config('chunk', '10MB', r'''
the granularity that a shared cache is redistributed''',
min='1MB', max='10TB'),
+ Config('name', 'none', r'''
+ the name of a cache that is shared between databases or
+ \c "none" when no shared cache is configured'''),
+ Config('quota', '0', r'''
+ maximum size of cache this database can be allocated from the
+ shared cache. Defaults to the entire shared cache size''',
+ type='int'),
Config('reserve', '0', r'''
amount of cache this database is guaranteed to have
available from the shared cache. This setting is per
database. Defaults to the chunk size''', type='int'),
- Config('name', 'none', r'''
- the name of a cache that is shared between databases or
- \c "none" when no shared cache is configured'''),
Config('size', '500MB', r'''
maximum memory to allocate for the shared cache. Setting
this will update the value if one is already set''',
@@ -981,8 +985,10 @@ methods = {
connection_runtime_config +
common_wiredtiger_open + [
Config('config_base', 'true', r'''
- write the base configuration file if creating the database,
- see @ref config_base for more information''',
+ write the base configuration file if creating the database. If
+ \c false in the config passed directly to ::wiredtiger_open, will
+ ignore any existing base configuration file in addition to not creating
+ one. See @ref config_base for more information''',
type='boolean'),
Config('create', 'false', r'''
create the database if it does not exist''',
@@ -1011,8 +1017,10 @@ methods = {
connection_runtime_config +
common_wiredtiger_open + [
Config('config_base', 'true', r'''
- write the base configuration file if creating the database,
- see @ref config_base for more information''',
+ write the base configuration file if creating the database. If
+ \c false in the config passed directly to ::wiredtiger_open, will
+ ignore any existing base configuration file in addition to not creating
+ one. See @ref config_base for more information''',
type='boolean'),
Config('create', 'false', r'''
create the database if it does not exist''',
diff --git a/dist/filelist b/dist/filelist
index c3321cf845d..f33f0e9a962 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -45,6 +45,7 @@ src/btree/col_srch.c
src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
+src/cache/cache_las.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
diff --git a/dist/flags.py b/dist/flags.py
index c8d9bcc6a5e..d98f249335e 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -12,7 +12,6 @@ flags = {
'SYNC_CHECKPOINT',
'SYNC_CLOSE',
'SYNC_DISCARD',
- 'SYNC_DISCARD_FORCE',
'SYNC_WRITE_LEAVES',
],
'file_types' : [
@@ -46,9 +45,10 @@ flags = {
'READ_WONT_NEED',
],
'rec_write' : [
+ 'EVICT_LOOKASIDE',
'EVICTING',
- 'SKIP_UPDATE_ERR',
- 'SKIP_UPDATE_RESTORE',
+ 'EVICT_UPDATE_RESTORE',
+ 'VISIBILITY_ERR',
],
'txn_log_checkpoint' : [
'TXN_LOG_CKPT_CLEANUP',
@@ -107,19 +107,20 @@ flags = {
'session' : [
'SESSION_CAN_WAIT',
'SESSION_CLEAR_EVICT_WALK',
- 'SESSION_DISCARD_FORCE',
+ 'SESSION_INTERNAL',
'SESSION_LOCKED_CHECKPOINT',
'SESSION_LOCKED_HANDLE_LIST',
'SESSION_LOCKED_SCHEMA',
+ 'SESSION_LOCKED_SLOT',
'SESSION_LOCKED_TABLE',
- 'SESSION_INTERNAL',
'SESSION_LOGGING_INMEM',
+ 'SESSION_LOOKASIDE_CURSOR',
'SESSION_NO_CACHE',
- 'SESSION_NO_CACHE_CHECK',
'SESSION_NO_DATA_HANDLES',
+ 'SESSION_NO_EVICTION',
'SESSION_NO_LOGGING',
'SESSION_NO_SCHEMA_LOCK',
- 'SESSION_SALVAGE_CORRUPT_OK',
+ 'SESSION_QUIET_CORRUPT_FILE',
'SESSION_SERVER_ASYNC',
],
}
diff --git a/dist/s_all b/dist/s_all
index c624db06a97..8e3f265e79b 100755
--- a/dist/s_all
+++ b/dist/s_all
@@ -2,7 +2,7 @@
# Run standard scripts.
t=__wt.$$
-t_pfx=__s_all_tmp
+t_pfx=__s_all_tmp_
trap 'rm -f $t *.pyc __tmp __wt.* __s_all_tmp*' 0 1 2 3 13 15
# We require python which may not be installed.
diff --git a/dist/s_define b/dist/s_define
index 7809bf14918..77673bdcdf9 100755
--- a/dist/s_define
+++ b/dist/s_define
@@ -4,18 +4,23 @@
t=__wt.$$
trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-# List of files to search.
+# List of source files to search.
l=`sed -e 's,#.*,,' -e '/^$/d' -e 's,^,../,' filelist`
l="$l `echo ../src/include/*.i ../src/utilities/*.c ../test/*/*.c`"
+# List of include files for source #defines.
+# Ignore the queue.h file, we don't use most of it.
+dl="../src/include/*.[hi] ../src/include/*.in"
+dl=`echo $dl | sed 's/ [^ ]*queue.h//'`
+
(
# Copy out the list of #defines we don't use, but it's OK.
sed -e '/^$/d' -e '/^#/d' < s_define.list
-# Get the list of #defines.
-# Ignore the list of configuration objects
-# Ignore the list of statistic "keys" generated for applications.
-search=`cat ../src/include/*.[hi] ../src/include/*.in |
+# Search the list of include files for #defines
+# Ignore configuration objects #defines
+# Ignore statistic "keys" generated for applications #defines
+search=`cat $dl |
sed -e '/configuration section: BEGIN/,/configuration section: END/d' \
-e '/Statistics section: BEGIN/,/Statistics section: END/d' |
egrep '^#define' |
diff --git a/dist/s_define.list b/dist/s_define.list
index 623a34447a8..aaf365a7376 100644
--- a/dist/s_define.list
+++ b/dist/s_define.list
@@ -16,115 +16,43 @@ TXN_API_CALL
TXN_API_CALL_NOCONF
TXN_API_END
WIN32_LEAN_AND_MEAN
-WT_ATOMIC_ADD1
-WT_ATOMIC_ADD2
-WT_ATOMIC_CAS1
-WT_ATOMIC_CAS2
-WT_ATOMIC_FETCH_ADD1
-WT_ATOMIC_FETCH_ADD2
-WT_ATOMIC_FETCH_ADD4
-WT_ATOMIC_STORE1
-WT_ATOMIC_STORE2
-WT_ATOMIC_SUB1
-WT_ATOMIC_SUB2
+WT_ATOMIC_CAS
+WT_ATOMIC_FUNC
WT_BARRIER
WT_BLOCK_DESC_SIZE
WT_CACHE_LINE_ALIGNMENT
WT_COMPILER_TYPE_ALIGN
WT_CONN_CHECK_PANIC
+WT_COUNTER_SLOTS
WT_DEADLOCK
WT_DEBUG_BYTE
WT_HANDLE_CLOSED
WT_HANDLE_NULLABLE
+WT_LOG_SLOT_ACTIVE
+WT_LOG_SLOT_BITS
+WT_LOG_SLOT_JOIN_MASK
+WT_LOG_SLOT_MASK_OFF
+WT_LOG_SLOT_MASK_ON
+WT_LOG_SLOT_MAXBITS
+WT_LOG_SLOT_UNBUFFERED_ISSET
WT_PACKED_STRUCT_BEGIN
WT_PACKED_STRUCT_END
WT_READ_BARRIER
WT_REF_SIZE
WT_SESSION_LOCKED_CHECKPOINT
-WT_STAT_ATOMIC_DECR
-WT_STAT_ATOMIC_DECRV
-WT_STAT_ATOMIC_INCR
-WT_STAT_ATOMIC_INCRV
+WT_STATS_FIELD_TO_SLOT
+WT_STATS_SLOT_ID
WT_STAT_DECR
WT_STAT_DECRV
-WT_STAT_FAST_ATOMIC_DECR
-WT_STAT_FAST_ATOMIC_DECRV
-WT_STAT_FAST_ATOMIC_INCR
-WT_STAT_FAST_ATOMIC_INCRV
-WT_STAT_FAST_CONN_ATOMIC_DECRV
-WT_STAT_FAST_CONN_ATOMIC_INCRV
WT_STAT_FAST_CONN_DECRV
WT_STAT_FAST_DATA_DECRV
WT_STAT_FAST_DECR
WT_STAT_FAST_DECRV
+WT_STAT_FAST_INCR
WT_STAT_FAST_INCRV
WT_STAT_FAST_SET
+WT_STAT_WRITE
WT_WITH_LOCK
__F
__WIREDTIGER_EXT_H_
__WIREDTIGER_H_
-__WT_ATOMIC_ADD
-__WT_ATOMIC_CAS
-__WT_ATOMIC_FETCH_ADD
-__WT_ATOMIC_STORE
-__WT_ATOMIC_SUB
-
-# List of queue.h #defines that are "unused", but it's OK.
-LIST_EMPTY
-LIST_ENTRY
-LIST_FIRST
-LIST_FOREACH
-LIST_HEAD
-LIST_HEAD_INITIALIZER
-LIST_INIT
-LIST_INSERT_AFTER
-LIST_INSERT_BEFORE
-LIST_INSERT_HEAD
-LIST_NEXT
-LIST_REMOVE
-QMD_TRACE_ELEM
-QMD_TRACE_HEAD
-QUEUE_MACRO_DEBUG
-SLIST_EMPTY
-SLIST_ENTRY
-SLIST_FIRST
-SLIST_FOREACH
-SLIST_FOREACH_PREVPTR
-SLIST_HEAD
-SLIST_HEAD_INITIALIZER
-SLIST_INIT
-SLIST_INSERT_AFTER
-SLIST_INSERT_HEAD
-SLIST_NEXT
-SLIST_REMOVE
-SLIST_REMOVE_HEAD
-STAILQ_CONCAT
-STAILQ_EMPTY
-STAILQ_ENTRY
-STAILQ_FIRST
-STAILQ_FOREACH
-STAILQ_HEAD
-STAILQ_HEAD_INITIALIZER
-STAILQ_INIT
-STAILQ_INSERT_AFTER
-STAILQ_INSERT_HEAD
-STAILQ_INSERT_TAIL
-STAILQ_LAST
-STAILQ_NEXT
-STAILQ_REMOVE
-STAILQ_REMOVE_HEAD
-STAILQ_REMOVE_HEAD_UNTIL
-TAILQ_CONCAT
-TAILQ_EMPTY
-TAILQ_ENTRY
-TAILQ_FOREACH_REVERSE
-TAILQ_HEAD
-TAILQ_HEAD_INITIALIZER
-TAILQ_INSERT_AFTER
-TAILQ_INSERT_BEFORE
-TAILQ_LAST
-TAILQ_NEXT
-TAILQ_PREV
-TRACEBUF
-TRASHIT
-_DB_QUEUE_H_
diff --git a/dist/s_stat b/dist/s_stat
index 152097f14be..44c22ab56bb 100755
--- a/dist/s_stat
+++ b/dist/s_stat
@@ -16,7 +16,7 @@ l="$l `echo ../src/include/*.i`"
(
# Get the list of statistics fields.
search=`sed \
- -e 's/^ WT_STATS \([a-z_*]*\);$/\1/p' \
+ -e 's/^ int64_t \([a-z_*]*\);$/\1/p' \
-e d ../src/include/stat.h |
sort`
diff --git a/dist/s_string.ok b/dist/s_string.ok
index 48c0f7f30f4..bfc4124f74d 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -32,6 +32,7 @@ BIGENDIAN
BOOL
BSR
BTREE
+BUF
BZ
Barack
Bitfield
@@ -156,11 +157,13 @@ KVS
Kanowski's
Kounavis
LANGID
+LAS
LF
LLLLLL
LLLLLLL
LOGREC
LOGSCAN
+LOOKASIDE
LRU
LRVv
LSB
@@ -176,6 +179,7 @@ Levyx
Llqr
Llqrt
LockFile
+Lookaside
Lookup
MALLOC
MEM
@@ -210,6 +214,7 @@ NetBSD
NoAddr
Noll
Nul
+OOB
OPTYPE
OUTBUFF
OVFL
@@ -231,6 +236,7 @@ Preload
Prepend
Qsort
RCS
+RECNO
REF's
REFs
RET
@@ -291,6 +297,7 @@ ULINE
URI
URIs
UTF
+Unbuffered
UnixLib
Unmap
UnmapViewOfFile
@@ -320,6 +327,7 @@ WiredTiger's
WiredTigerCheckpoint
WiredTigerException
WiredTigerInit
+WiredTigerLAS
WiredTigerLog
WiredTigerPreplog
WiredTigerTmplog
@@ -396,6 +404,7 @@ bzalloc
bzfree
bzip
calloc
+cas
catfmt
cd
centric
@@ -494,6 +503,7 @@ desc
dest
destSize
dev
+dh
dhandle
dhandles
dir
@@ -503,6 +513,7 @@ dlh
dll
dlopen
dlsym
+dmalloc
dmsg
doxgen
doxygen
@@ -512,6 +523,7 @@ dsk
dsrc
dst
dstlen
+dstrdup
dsync
dumpcmp
dumpfile
@@ -648,6 +660,7 @@ kvraw
kvs
kvsbdb
lang
+las
latencies
lbrace
lbracket
@@ -675,6 +688,7 @@ logread
logrec
logsize
logtest
+lookaside
lookup
lookups
lossy
@@ -745,6 +759,7 @@ nop
noraw
notfound
notsup
+notused
nset
nsnap
nul
@@ -797,6 +812,7 @@ progname
ps
psp
pthread
+ptr
pushms
putK
putV
@@ -937,6 +953,7 @@ uS
uint
uintmax
unbare
+unbuffered
uncompressing
uncompresssed
undef
@@ -945,6 +962,7 @@ unesc
unescaped
uninstantiated
unistd
+unlinked
unmap
unmarshall
unmarshalled
diff --git a/dist/s_style b/dist/s_style
index e5411748a31..0e013852914 100755
--- a/dist/s_style
+++ b/dist/s_style
@@ -46,6 +46,11 @@ else
cat $t
fi
+ if ! expr "$f" : 'src/include/queue\.h' > /dev/null &&
+ egrep 'STAILQ_|SLIST_|\bLIST_' $f ; then
+ echo "$f: use TAILQ for all lists"
+ fi
+
if ! expr "$f" : 'src/os_posix/.*' > /dev/null &&
! expr "$f" : 'src/os_win/.*' > /dev/null &&
! expr "$f" : 'src/include/extern.h' > /dev/null &&
@@ -69,6 +74,13 @@ else
cat $t
}
+ # Alignment directive before "struct".
+ egrep 'WT_COMPILER_TYPE_ALIGN.*struct' $f > $t
+ test -s $t && {
+		echo "$f: compiler alignment directive must precede \"struct\""
+ cat $t
+ }
+
# Direct calls to functions we're not supposed to use in the library.
# We don't check for all of them, just a few of the common ones.
if ! expr "$f" : 'bench/.*' > /dev/null &&
diff --git a/dist/s_typedef b/dist/s_typedef
index 2e206757f48..233f432f0e5 100755
--- a/dist/s_typedef
+++ b/dist/s_typedef
@@ -25,7 +25,7 @@ build() {
$l |
sed -e 's/WT_PACKED_STRUCT_BEGIN(\(.*\))/struct \1 {/' \
-e 's/WT_COMPILER_TYPE_ALIGN(.*)[ ]*//' \
- -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort | \
+ -e 's/^[ ]*//' -e 's/[ ]*{.*//' | sort -u | \
while read t n; do
upper=`echo $n | sed -e 's/^__//' | tr [a-z] [A-Z]`
echo "$t $n;"
diff --git a/dist/s_whitespace b/dist/s_whitespace
index 3a51b251bfe..dfc031e3ea4 100755
--- a/dist/s_whitespace
+++ b/dist/s_whitespace
@@ -4,7 +4,16 @@
t=__wt.$$
trap 'rm -f $t; exit 0' 0 1 2 3 13 15
-ws()
+# Clear lines that only contain whitespace.
+whitespace()
+{
+ sed -e 's/[ ][ ]*$//' < $1 > $t
+ cmp $t $1 > /dev/null 2>&1 || (echo "$1" && cp $t $1)
+}
+
+# Clear lines that only contain whitespace, compress multiple empty lines
+# into a single line, discarding trailing empty lines.
+whitespace_and_empty_line()
{
sed -e 's/[ ][ ]*$//' \
-e '/^$/N' \
@@ -14,10 +23,12 @@ ws()
cd ..
+# Scripts.
for f in `find dist -name '*.py' -name 's_*'`; do
- ws $f
+ whitespace_and_empty_line $f
done
+# C-language sources.
for f in `find examples ext src test \
-name '*.[chi]' -o \
-name '*.dox' -o \
@@ -26,5 +37,11 @@ for f in `find examples ext src test \
if expr "$f" : ".*/Makefile.in" > /dev/null; then
continue
fi
- ws $f
+ whitespace_and_empty_line $f
+done
+
+# Python sources.
+for f in `find test \
+ -name '*.py' | sed '/3rdparty/d'`; do
+ whitespace $f
done
diff --git a/dist/stat.py b/dist/stat.py
index 2a87d4425e6..c9684665a53 100644
--- a/dist/stat.py
+++ b/dist/stat.py
@@ -12,12 +12,11 @@ def print_struct(title, name, base, stats):
f.write('/*\n')
f.write(' * Statistics entries for ' + title + '.\n')
f.write(' */\n')
- f.write(
- '#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
+ f.write('#define\tWT_' + name.upper() + '_STATS_BASE\t' + str(base) + '\n')
f.write('struct __wt_' + name + '_stats {\n')
for l in stats:
- f.write('\tWT_STATS ' + l.name + ';\n')
+ f.write('\tint64_t ' + l.name + ';\n')
f.write('};\n\n')
# Update the #defines in the stat.h file.
@@ -90,67 +89,113 @@ for line in open('../src/include/wiredtiger.in', 'r'):
f.close()
compare_srcfile(tmp_file, '../src/include/wiredtiger.in')
-def print_func(name, list):
- '''Print the functions for the stat.c file.'''
+def print_func(name, handle, list):
+ '''Print the structures/functions for the stat.c file.'''
+ f.write('\n')
+ f.write('static const char * const __stats_' + name + '_desc[] = {\n')
+ for l in list:
+ f.write('\t"' + l.desc + '",\n')
+ f.write('};\n')
+
+ f.write('''
+const char *
+__wt_stat_''' + name + '''_desc(int slot)
+{
+\treturn (__stats_''' + name + '''_desc[slot]);
+}
+''')
+
f.write('''
void
-__wt_stat_init_''' + name + '''_stats(WT_''' + name.upper() + '''_STATS *stats)
+__wt_stat_''' + name + '_init_single(WT_' + name.upper() + '''_STATS *stats)
{
-\t/* Clear, so can also be called for reinitialization. */
\tmemset(stats, 0, sizeof(*stats));
-
-''')
- for l in sorted(list):
- o = '\tstats->' + l.name + '.desc = "' + l.desc + '";\n'
- if len(o) + 7 > 80:
- o = o.replace('= ', '=\n\t ')
- f.write(o)
- f.write('''}
+}
''')
f.write('''
void
-__wt_stat_refresh_''' + name + '''_stats(void *stats_arg)
+__wt_stat_''' + name + '_init(' + handle + ''' *handle)
{
-\tWT_''' + name.upper() + '''_STATS *stats;
+\tint i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+\t\thandle->stats[i] = &handle->stat_array[i];
+\t\t__wt_stat_''' + name + '''_init_single(handle->stats[i]);
+\t}
+}
+''')
-\tstats = (WT_''' + name.upper() + '''_STATS *)stats_arg;
+ f.write('''
+void
+__wt_stat_''' + name + '_clear_single(WT_' + name.upper() + '''_STATS *stats)
+{
''')
for l in sorted(list):
# no_clear: don't clear the value.
- if not 'no_clear' in l.flags:
- f.write('\tstats->' + l.name + '.v = 0;\n');
+ if 'no_clear' in l.flags:
+ f.write('\t\t/* not clearing ' + l.name + ' */\n')
+ else:
+ f.write('\tstats->' + l.name + ' = 0;\n')
f.write('}\n')
- # Aggregation is only interesting for data-source statistics.
- # Complain if any aggregation flags are set.
- if name == 'connection':
+ f.write('''
+void
+__wt_stat_''' + name + '_clear_all(WT_' + name.upper() + '''_STATS **stats)
+{
+\tu_int i;
+
+\tfor (i = 0; i < WT_COUNTER_SLOTS; ++i)
+\t\t__wt_stat_''' + name + '''_clear_single(stats[i]);
+}
+''')
+
+ # Single structure aggregation is currently only used by data sources.
+ if name == 'dsrc':
+ f.write('''
+void
+__wt_stat_''' + name + '''_aggregate_single(
+ WT_''' + name.upper() + '_STATS *from, WT_' + name.upper() + '''_STATS *to)
+{
+''')
for l in sorted(list):
- if 'no_aggregate' in l.flags or 'max_aggregate' in l.flags:
- print >>sys.stdout,\
- "Aggregation configuration for " +\
- name + "." + l.name + " statistics not supported"
- return;
+ if 'no_aggregate' in l.flags:
+ o = '\tto->' + l.name + ' = from->' + l.name + ';\n'
+ elif 'max_aggregate' in l.flags:
+ o = '\tif (from->' + l.name + ' > to->' + l.name + ')\n' +\
+ '\t\tto->' + l.name + ' = from->' + l.name + ';\n'
+ else:
+ o = '\tto->' + l.name + ' += from->' + l.name + ';\n'
+ if len(o) > 72: # Account for the leading tab.
+ o = o.replace(' += ', ' +=\n\t ')
+ f.write(o)
+ f.write('}\n')
f.write('''
void
-__wt_stat_aggregate_''' + name +
-'''_stats(const void *child, const void *parent)
+__wt_stat_''' + name + '''_aggregate(
+ WT_''' + name.upper() + '_STATS **from, WT_' + name.upper() + '''_STATS *to)
{
-\tWT_''' + name.upper() + '''_STATS *c, *p;
-
-\tc = (WT_''' + name.upper() + '''_STATS *)child;
-\tp = (WT_''' + name.upper() + '''_STATS *)parent;
''')
+ # Connection level aggregation does not currently have any computation
+ # of a maximum value; I'm leaving in support for it, but don't declare
+ # a temporary variable until it's needed.
+ for l in sorted(list):
+ if 'max_aggregate' in l.flags:
+ f.write('\tint64_t v;\n\n')
+ break;
for l in sorted(list):
if 'no_aggregate' in l.flags:
- continue;
+ o = '\tto->' + l.name + ' = from[0]->' + l.name + ';\n'
elif 'max_aggregate' in l.flags:
- o = 'if (c->' + l.name + '.v > p->' + l.name +\
- '.v)\n\t p->' + l.name + '.v = c->' + l.name + '.v;'
+ o = '\tif ((v = WT_STAT_READ(from, ' + l.name + ')) >\n' +\
+ '\t to->' + l.name + ')\n' +\
+ '\t\tto->' + l.name + ' = v;\n'
else:
- o = 'p->' + l.name + '.v += c->' + l.name + '.v;'
- f.write('\t' + o + '\n')
+ o = '\tto->' + l.name + ' += WT_STAT_READ(from, ' + l.name + ');\n'
+ if len(o) > 72: # Account for the leading tab.
+ o = o.replace(' += ', ' +=\n\t ')
+ f.write(o)
f.write('}\n')
# Write the stat initialization and refresh routines to the stat.c file.
@@ -158,12 +203,11 @@ f = open(tmp_file, 'w')
f.write('/* DO NOT EDIT: automatically built by dist/stat.py. */\n\n')
f.write('#include "wt_internal.h"\n')
-print_func('dsrc', dsrc_stats)
-print_func('connection', connection_stats)
+print_func('dsrc', 'WT_DATA_HANDLE', dsrc_stats)
+print_func('connection', 'WT_CONNECTION_IMPL', connection_stats)
f.close()
compare_srcfile(tmp_file, '../src/support/stat.c')
-
# Update the statlog file with the entries we can scale per second.
scale_info = 'no_scale_per_second_list = [\n'
clear_info = 'no_clear_list = [\n'
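
The regenerated functions above reflect the statistics redesign running through this merge: counters become plain int64_t fields replicated across WT_COUNTER_SLOTS copies per handle, and the aggregate functions read them back through WT_STAT_READ. A sketch of the striped-counter idea, assuming WT_STAT_READ sums a field across the slots as the generated aggregation code suggests (the slot count and field name here are illustrative, not WiredTiger's):

    #include <stdint.h>

    #define COUNTER_SLOTS 8    /* illustrative; WT uses WT_COUNTER_SLOTS */

    struct my_stats { int64_t cache_read; };

    /* Writers touch only their own slot, avoiding cache-line contention. */
    static void
    incr_cache_read(struct my_stats **stats, int slot)
    {
        stats[slot % COUNTER_SLOTS]->cache_read++;
    }

    /* Readers pay the cost instead: sum the field across all slots. */
    static int64_t
    read_cache_read(struct my_stats **stats)
    {
        int64_t sum;
        int i;

        for (sum = 0, i = 0; i < COUNTER_SLOTS; ++i)
            sum += stats[i]->cache_read;
        return (sum);
    }
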
diff --git a/dist/stat_data.py b/dist/stat_data.py
index caf68364696..c91fc921380 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -7,14 +7,21 @@
# currently open'.
# NOTE: All statistics descriptions must have a prefix string followed by ':'.
#
-# Optional configuration flags:
-# no_clear Value not cleared when statistics cleared
-# no_scale Don't scale value per second in the logging tool script
-#
# Data-source statistics are normally aggregated across the set of underlying
# objects. Additional optional configuration flags are available:
# no_aggregate Ignore the value when aggregating statistics
# max_aggregate Take the maximum value when aggregating statistics
+#
+# Optional configuration flags:
+# no_clear Value not cleared when statistics cleared
+# no_scale Don't scale value per second in the logging tool script
+#
+# The no_clear flag is a little complicated: it means we don't clear the values
+# when resetting statistics after each run (necessary when the WiredTiger engine
+# is updating values that persist over multiple runs, for example the count of
+# cursors), but it also causes the underlying display routines to not treat the
+# change between displays as relative to the number of seconds, that is, it's an
+# absolute value. The no_clear flag should be set in either case.
from operator import attrgetter
import sys
@@ -120,9 +127,9 @@ connection_stats = [
AsyncStat('async_alloc_race', 'number of allocation state races'),
AsyncStat('async_alloc_view',
'number of operation slots viewed for allocation'),
+ AsyncStat('async_cur_queue', 'current work queue length'),
AsyncStat('async_flush', 'number of flush calls'),
AsyncStat('async_full', 'number of times operation allocation failed'),
- AsyncStat('async_cur_queue', 'current work queue length'),
AsyncStat('async_max_queue',
'maximum work queue length', 'no_clear,no_scale'),
AsyncStat('async_nowork', 'number of times worker found no work'),
@@ -149,11 +156,11 @@ connection_stats = [
##########################################
CacheStat('cache_bytes_dirty',
'tracked dirty bytes in the cache', 'no_clear,no_scale'),
- CacheStat('cache_bytes_inuse',
- 'bytes currently in the cache', 'no_clear,no_scale'),
CacheStat('cache_bytes_internal',
'tracked bytes belonging to internal pages in the cache',
'no_clear,no_scale'),
+ CacheStat('cache_bytes_inuse',
+ 'bytes currently in the cache', 'no_clear,no_scale'),
CacheStat('cache_bytes_leaf',
'tracked bytes belonging to leaf pages in the cache',
'no_clear,no_scale'),
@@ -165,11 +172,11 @@ connection_stats = [
CacheStat('cache_bytes_read', 'bytes read into cache'),
CacheStat('cache_bytes_write', 'bytes written from cache'),
CacheStat('cache_eviction_app', 'pages evicted by application threads'),
+ CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_clean', 'unmodified pages evicted'),
CacheStat('cache_eviction_deepen',
'page split during eviction deepened the tree'),
CacheStat('cache_eviction_dirty', 'modified pages evicted'),
- CacheStat('cache_eviction_checkpoint', 'checkpoint blocked page eviction'),
CacheStat('cache_eviction_fail',
'pages selected for eviction unable to be evicted'),
CacheStat('cache_eviction_force',
@@ -197,21 +204,35 @@ connection_stats = [
CacheStat('cache_eviction_worker_evicting',
'eviction worker thread evicting pages'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
+ CacheStat('cache_inmem_splittable',
+ 'in-memory page passed criteria to be split'),
+ CacheStat('cache_lookaside_insert', 'lookaside table insert calls'),
+ CacheStat('cache_lookaside_remove', 'lookaside table remove calls'),
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
CacheStat('cache_pages_dirty',
'tracked dirty pages in the cache', 'no_clear,no_scale'),
CacheStat('cache_pages_inuse',
'pages currently held in the cache', 'no_clear,no_scale'),
CacheStat('cache_read', 'pages read into cache'),
+ CacheStat('cache_read_lookaside',
+ 'pages read into cache requiring lookaside entries'),
CacheStat('cache_write', 'pages written from cache'),
+ CacheStat('cache_write_lookaside',
+ 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore',
+ 'pages written requiring in-memory restoration'),
##########################################
# Dhandle statistics
##########################################
- DhandleStat('dh_conn_handles', 'connection dhandles swept'),
- DhandleStat('dh_conn_ref', 'connection candidate referenced'),
- DhandleStat('dh_conn_sweeps', 'connection sweeps'),
- DhandleStat('dh_conn_tod', 'connection time-of-death sets'),
+ DhandleStat('dh_conn_handle_count',
+ 'connection data handles currently active', 'no_clear,no_scale'),
+ DhandleStat('dh_sweep_close', 'connection sweep dhandles closed'),
+ DhandleStat('dh_sweep_remove',
+ 'connection sweep dhandles removed from hash list'),
+ DhandleStat('dh_sweep_ref', 'connection sweep candidate became referenced'),
+ DhandleStat('dh_sweep_tod', 'connection sweep time-of-death sets'),
+ DhandleStat('dh_sweeps', 'connection sweeps'),
DhandleStat('dh_session_handles', 'session dhandles swept'),
DhandleStat('dh_session_sweeps', 'session sweep attempts'),
@@ -225,8 +246,8 @@ connection_stats = [
LogStat('log_compress_len', 'total size of compressed records'),
LogStat('log_compress_mem', 'total in-memory size of compressed records'),
LogStat('log_compress_small', 'log records too small to compress'),
- LogStat('log_compress_writes', 'log records compressed'),
LogStat('log_compress_write_fails', 'log records not compressed'),
+ LogStat('log_compress_writes', 'log records compressed'),
LogStat('log_max_filesize', 'maximum log file size', 'no_clear,no_scale'),
LogStat('log_prealloc_files', 'pre-allocated log files prepared'),
LogStat('log_prealloc_max',
@@ -236,20 +257,18 @@ connection_stats = [
LogStat('log_scan_records', 'records processed by log scan'),
LogStat('log_scan_rereads', 'log scan records requiring two reads'),
LogStat('log_scans', 'log scan operations'),
- LogStat('log_sync', 'log sync operations'),
- LogStat('log_sync_dir', 'log sync_dir operations'),
- LogStat('log_writes', 'log write operations'),
- LogStat('log_write_lsn', 'log server thread advances write LSN'),
-
+ LogStat('log_slot_closes', 'consolidated slot closures'),
LogStat('log_slot_coalesced', 'written slots coalesced'),
LogStat('log_slot_consolidated', 'logging bytes consolidated'),
- LogStat('log_slot_closes', 'consolidated slot closures'),
LogStat('log_slot_joins', 'consolidated slot joins'),
LogStat('log_slot_races', 'consolidated slot join races'),
- LogStat('log_slot_toobig', 'record size exceeded maximum'),
- LogStat('log_slot_toosmall',
- 'failed to find a slot large enough for record'),
+ LogStat('log_slot_switch_busy', 'busy returns attempting to switch slots'),
LogStat('log_slot_transitions', 'consolidated slot join transitions'),
+ LogStat('log_slot_unbuffered', 'consolidated slot unbuffered writes'),
+ LogStat('log_sync', 'log sync operations'),
+ LogStat('log_sync_dir', 'log sync_dir operations'),
+ LogStat('log_write_lsn', 'log server thread advances write LSN'),
+ LogStat('log_writes', 'log write operations'),
##########################################
# Reconciliation statistics
@@ -268,6 +287,8 @@ connection_stats = [
TxnStat('txn_checkpoint', 'transaction checkpoints'),
TxnStat('txn_checkpoint_generation',
'transaction checkpoint generation', 'no_clear,no_scale'),
+ TxnStat('txn_checkpoint_running',
+ 'transaction checkpoint currently running', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_max',
'transaction checkpoint max time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_min',
@@ -276,17 +297,16 @@ connection_stats = [
'transaction checkpoint most recent time (msecs)', 'no_clear,no_scale'),
TxnStat('txn_checkpoint_time_total',
'transaction checkpoint total time (msecs)', 'no_clear,no_scale'),
- TxnStat('txn_checkpoint_running',
- 'transaction checkpoint currently running', 'no_clear,no_scale'),
+ TxnStat('txn_commit', 'transactions committed'),
+ TxnStat('txn_fail_cache',
+ 'transaction failures due to cache overflow'),
TxnStat('txn_pinned_checkpoint_range',
'transaction range of IDs currently pinned by a checkpoint',
- 'no_clear,no_scale'),
+ 'no_clear,no_scale'),
TxnStat('txn_pinned_range',
'transaction range of IDs currently pinned', 'no_clear,no_scale'),
- TxnStat('txn_sync', 'transaction sync calls'),
- TxnStat('txn_commit', 'transactions committed'),
- TxnStat('txn_fail_cache', 'transaction failures due to cache overflow'),
TxnStat('txn_rollback', 'transactions rolled back'),
+ TxnStat('txn_sync', 'transaction sync calls'),
##########################################
# LSM statistics
@@ -322,6 +342,7 @@ connection_stats = [
CursorStat('cursor_prev', 'cursor prev calls'),
CursorStat('cursor_remove', 'cursor remove calls'),
CursorStat('cursor_reset', 'cursor reset calls'),
+ CursorStat('cursor_restart', 'cursor restarted searches'),
CursorStat('cursor_search', 'cursor search calls'),
CursorStat('cursor_search_near', 'cursor search near calls'),
CursorStat('cursor_update', 'cursor update calls'),
@@ -362,6 +383,7 @@ dsrc_stats = [
CursorStat('cursor_remove', 'remove calls'),
CursorStat('cursor_remove_bytes', 'cursor-remove key bytes removed'),
CursorStat('cursor_reset', 'reset calls'),
+ CursorStat('cursor_restart', 'restarted searches'),
CursorStat('cursor_search', 'search calls'),
CursorStat('cursor_search_near', 'search near calls'),
CursorStat('cursor_update', 'update calls'),
@@ -378,6 +400,8 @@ dsrc_stats = [
'column-store fixed-size leaf pages', 'no_scale'),
BtreeStat('btree_column_internal',
'column-store internal pages', 'no_scale'),
+ BtreeStat('btree_column_rle',
+ 'column-store variable-size RLE encoded values', 'no_scale'),
BtreeStat('btree_column_variable',
'column-store variable-size leaf pages', 'no_scale'),
BtreeStat('btree_compact_rewrite', 'pages rewritten by compaction'),
@@ -421,9 +445,9 @@ dsrc_stats = [
##########################################
# Block manager statistics
##########################################
- BlockStat('block_alloc', 'blocks allocated'),
BlockStat('allocation_size',
'file allocation unit size', 'no_aggregate,no_scale'),
+ BlockStat('block_alloc', 'blocks allocated'),
BlockStat('block_checkpoint_size', 'checkpoint size', 'no_scale'),
BlockStat('block_extension', 'allocations requiring file extension'),
BlockStat('block_free', 'blocks freed'),
@@ -450,20 +474,28 @@ dsrc_stats = [
CacheStat('cache_eviction_internal', 'internal pages evicted'),
CacheStat('cache_eviction_split', 'pages split during eviction'),
CacheStat('cache_inmem_split', 'in-memory page splits'),
+ CacheStat('cache_inmem_splittable',
+ 'in-memory page passed criteria to be split'),
CacheStat('cache_overflow_value',
'overflow values cached in memory', 'no_scale'),
CacheStat('cache_read', 'pages read into cache'),
+ CacheStat('cache_read_lookaside',
+ 'pages read into cache requiring lookaside entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
CacheStat('cache_write', 'pages written from cache'),
+ CacheStat('cache_write_lookaside',
+ 'page written requiring lookaside records'),
+ CacheStat('cache_write_restore',
+ 'pages written requiring in-memory restoration'),
##########################################
# Compression statistics
##########################################
- CompressStat('compress_raw_ok', 'raw compression call succeeded'),
CompressStat('compress_raw_fail',
'raw compression call failed, no additional data available'),
CompressStat('compress_raw_fail_temporary',
'raw compression call failed, additional data available'),
+ CompressStat('compress_raw_ok', 'raw compression call succeeded'),
CompressStat('compress_read', 'compressed pages read'),
CompressStat('compress_write', 'compressed pages written'),
CompressStat('compress_write_fail', 'page written failed to compress'),
@@ -474,21 +506,21 @@ dsrc_stats = [
# Reconciliation statistics
##########################################
RecStat('rec_dictionary', 'dictionary matches'),
+ RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
+ RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
+ RecStat('rec_multiblock_max',
+ 'maximum blocks required for a page', 'max_aggregate,no_scale'),
RecStat('rec_overflow_key_internal', 'internal-page overflow keys'),
RecStat('rec_overflow_key_leaf', 'leaf-page overflow keys'),
RecStat('rec_overflow_value', 'overflow values written'),
- RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_page_delete', 'pages deleted'),
+ RecStat('rec_page_match', 'page checksum matches'),
RecStat('rec_pages', 'page reconciliation calls'),
RecStat('rec_pages_eviction', 'page reconciliation calls for eviction'),
RecStat('rec_prefix_compression',
'leaf page key bytes discarded using prefix compression'),
RecStat('rec_suffix_compression',
'internal page key bytes discarded using suffix compression'),
- RecStat('rec_multiblock_internal', 'internal page multi-block writes'),
- RecStat('rec_multiblock_leaf', 'leaf page multi-block writes'),
- RecStat('rec_multiblock_max',
- 'maximum blocks required for a page', 'max_aggregate,no_scale'),
##########################################
# Transaction statistics
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 6905169c4c2..213e058d4cc 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -1064,7 +1064,8 @@ main(void)
home = NULL;
/*! [Open a connection] */
- ret = wiredtiger_open(home, NULL, "create,cache_size=500M", &conn);
+ ret = wiredtiger_open(home, NULL,
+ "create,cache_size=5GB,log=(enabled,recover=on)", &conn);
/*! [Open a connection] */
if (ret == 0)
diff --git a/examples/c/ex_log.c b/examples/c/ex_log.c
index 136cca900cd..d5a8f32487d 100644
--- a/examples/c/ex_log.c
+++ b/examples/c/ex_log.c
@@ -128,20 +128,22 @@ print_record(WT_LSN *lsn, uint32_t opcount,
* A simple walk of the log.
*/
static int
-simple_walk_log(WT_SESSION *session)
+simple_walk_log(WT_SESSION *session, int count_min)
{
WT_CURSOR *cursor;
WT_LSN lsn;
WT_ITEM logrec_key, logrec_value;
uint64_t txnid;
uint32_t fileid, opcount, optype, rectype;
- int ret;
+ int count, ret;
/*! [log cursor open] */
ret = session->open_cursor(session, "log:", NULL, NULL, &cursor);
/*! [log cursor open] */
+ count = 0;
while ((ret = cursor->next(cursor)) == 0) {
+ count++;
/*! [log cursor get_key] */
ret = cursor->get_key(cursor, &lsn.file, &lsn.offset, &opcount);
/*! [log cursor get_key] */
@@ -156,6 +158,12 @@ simple_walk_log(WT_SESSION *session)
if (ret == WT_NOTFOUND)
ret = 0;
ret = cursor->close(cursor);
+ if (count < count_min) {
+ fprintf(stderr,
+ "Expected minimum %d records, found %d\n",
+ count_min, count);
+ abort();
+ }
return (ret);
}
/*! [log cursor walk] */
@@ -206,11 +214,13 @@ walk_log(WT_SESSION *session)
/*
* If the operation is a put, replay it here on the backup
- * connection. Note, we cheat by looking only for fileid 1
- * in this example. The metadata is fileid 0.
+ * connection.
+ *
+ * !!!
+ * Minor cheat: the metadata is fileid 0, skip its records.
*/
- if (fileid == 1 && rectype == WT_LOGREC_COMMIT &&
- optype == WT_LOGOP_ROW_PUT) {
+ if (fileid != 0 &&
+ rectype == WT_LOGREC_COMMIT && optype == WT_LOGOP_ROW_PUT) {
if (!in_txn) {
ret = session2->begin_transaction(session2,
NULL);
@@ -276,9 +286,10 @@ main(void)
WT_CONNECTION *wt_conn;
WT_CURSOR *cursor;
WT_SESSION *session;
- int i, record_count, ret;
+ int count_min, i, record_count, ret;
char cmd_buf[256], k[16], v[16];
+ count_min = 0;
snprintf(cmd_buf, sizeof(cmd_buf), "rm -rf %s %s && mkdir %s %s",
home1, home2, home1, home2);
if ((ret = system(cmd_buf)) != 0) {
@@ -293,6 +304,7 @@ main(void)
ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
ret = session->create(session, uri, "key_format=S,value_format=S");
+ count_min++;
ret = session->open_cursor(session, uri, NULL, NULL, &cursor);
/*
@@ -304,6 +316,7 @@ main(void)
cursor->set_key(cursor, k);
cursor->set_value(cursor, v);
ret = cursor->insert(cursor);
+ count_min++;
}
ret = session->begin_transaction(session, NULL);
/*
@@ -317,10 +330,12 @@ main(void)
ret = cursor->insert(cursor);
}
ret = session->commit_transaction(session, NULL);
+ count_min++;
ret = cursor->close(cursor);
/*! [log cursor printf] */
ret = session->log_printf(session, "Wrote %d records", record_count);
+ count_min++;
/*! [log cursor printf] */
/*
@@ -336,7 +351,7 @@ main(void)
}
ret = wt_conn->open_session(wt_conn, NULL, NULL, &session);
- ret = simple_walk_log(session);
+ ret = simple_walk_log(session, count_min);
ret = walk_log(session);
ret = wt_conn->close(wt_conn, NULL);
return (ret);
diff --git a/examples/python/ex_access.py b/examples/python/ex_access.py
index 8eeefd56cf7..2940ac63625 100755
--- a/examples/python/ex_access.py
+++ b/examples/python/ex_access.py
@@ -50,6 +50,6 @@ cursor.insert()
# Iterate through the records
cursor.reset()
for key, value in cursor:
- print('Got record: ' + key + ' : ' + value)
+ print('Got record: %s : %s' % (key, value))
conn.close()
diff --git a/examples/python/ex_stat.py b/examples/python/ex_stat.py
index e27177403cc..af2c4f7a1a7 100755
--- a/examples/python/ex_stat.py
+++ b/examples/python/ex_stat.py
@@ -32,6 +32,7 @@
import os
from wiredtiger import wiredtiger_open,WIREDTIGER_VERSION_STRING,stat
+
def main():
# Create a clean test directory for this run of the test program
os.system('rm -rf WT_HOME')
@@ -39,16 +40,16 @@ def main():
# Connect to the database and open a session
conn = wiredtiger_open('WT_HOME', 'create,statistics=(all)')
session = conn.open_session()
-
+
# Create a simple table
session.create('table:access', 'key_format=S,value_format=S')
-
+
# Open a cursor and insert a record
cursor = session.open_cursor('table:access', None)
- cursor['key'] = 'value'
+ cursor['key'] = 'value'
cursor.close()
-
+
session.checkpoint()
print WIREDTIGER_VERSION_STRING
print_database_stats(session)
@@ -57,46 +58,51 @@ def main():
print_derived_stats(session)
conn.close()
+
def print_database_stats(session):
statcursor = session.open_cursor("statistics:")
print_cursor(statcursor)
statcursor.close()
+
def print_file_stats(session):
fstatcursor = session.open_cursor("statistics:table:access")
print_cursor(fstatcursor)
fstatcursor.close()
+
def print_overflow_pages(session):
ostatcursor = session.open_cursor("statistics:table:access")
val = ostatcursor[stat.dsrc.btree_overflow]
- if val != 0 :
- print str(val[0]) + '=' + str(val[1])
+ if val != 0:
+ print '%s=%s' % (str(val[0]), str(val[1]))
ostatcursor.close()
+
def print_derived_stats(session):
dstatcursor = session.open_cursor("statistics:table:access")
ckpt_size = dstatcursor[stat.dsrc.block_checkpoint_size][1]
file_size = dstatcursor[stat.dsrc.block_size][1]
percent = 0
- if file_size != 0 :
+ if file_size != 0:
percent = 100 * ((float(file_size) - float(ckpt_size)) / float(file_size))
- print "Table is %" + str(percent) + " fragmented"
+ print "Table is %%%s fragmented" % str(percent)
app_insert = int(dstatcursor[stat.dsrc.cursor_insert_bytes][1])
app_remove = int(dstatcursor[stat.dsrc.cursor_remove_bytes][1])
app_update = int(dstatcursor[stat.dsrc.cursor_update_bytes][1])
- fs_writes = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
+ fs_writes = int(dstatcursor[stat.dsrc.cache_bytes_write][1])
- if(app_insert + app_remove + app_update != 0):
+ if app_insert + app_remove + app_update != 0:
print "Write amplification is " + '{:.2f}'.format(fs_writes / (app_insert + app_remove + app_update))
dstatcursor.close()
+
def print_cursor(mycursor):
while mycursor.next() == 0:
val = mycursor.get_value()
- if val[1] != '0' :
- print str(val[0]) + '=' + str(val[1])
+ if val[1] != '0':
+ print '%s=%s' % (str(val[0]), str(val[1]))
if __name__ == "__main__":
main()
diff --git a/ext/encryptors/rotn/rotn_encrypt.c b/ext/encryptors/rotn/rotn_encrypt.c
index 503dcae83a7..5b29e66c503 100644
--- a/ext/encryptors/rotn/rotn_encrypt.c
+++ b/ext/encryptors/rotn/rotn_encrypt.c
@@ -68,7 +68,7 @@
typedef struct {
WT_ENCRYPTOR encryptor; /* Must come first */
- WT_EXTENSION_API *wt_api; /* Extension API */
+ WT_EXTENSION_API *wtext; /* Extension API */
int rot_N; /* rotN value */
char *keyid; /* Saved keyid */
@@ -76,6 +76,7 @@ typedef struct {
u_char *shift_forw; /* Encrypt shift data from secretkey */
u_char *shift_back; /* Decrypt shift data from secretkey */
size_t shift_len; /* Length of shift* byte arrays */
+ int force_error; /* Force a decrypt error for testing */
} ROTN_ENCRYPTOR;
/*! [WT_ENCRYPTOR initialization structure] */
@@ -84,6 +85,22 @@ typedef struct {
#define IV_LEN 16
/*
+ * rotn_error --
+ * Display an error from this module in a standard way.
+ */
+static int
+rotn_error(ROTN_ENCRYPTOR *encryptor, WT_SESSION *session, int err,
+ const char *msg)
+{
+ WT_EXTENSION_API *wtext;
+
+ wtext = encryptor->wtext;
+ (void)wtext->err_printf(wtext, session,
+ "rotn encryption: %s: %s", msg, wtext->strerror(wtext, NULL, err));
+ return (err);
+}
+
+/*
* make_cksum --
* This is where one would call a checksum function on the encrypted
* buffer. Here we just put a constant value in it.
@@ -221,13 +238,18 @@ rotn_decrypt(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
(void)session; /* Unused */
/*
+ * For certain tests, force an error we can recognize.
+ */
+ if (rotn_encryptor->force_error)
+ return (-1000);
+
+ /*
* Make sure it is big enough.
*/
mylen = src_len - (CHKSUM_LEN + IV_LEN);
- if (dst_len < mylen) {
- fprintf(stderr, "Rotate: ENOMEM ERROR\n");
- return (ENOMEM);
- }
+ if (dst_len < mylen)
+ return (rotn_error(rotn_encryptor, session,
+ ENOMEM, "decrypt buffer not big enough"));
/*
* !!! Most implementations would verify the checksum here.
@@ -286,7 +308,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
const ROTN_ENCRYPTOR *orig;
ROTN_ENCRYPTOR *rotn_encryptor;
WT_CONFIG_ITEM keyid, secret;
- WT_EXTENSION_API *wt_api;
+ WT_EXTENSION_API *wtext;
size_t i, len;
int ret, keyid_val;
u_char base;
@@ -295,7 +317,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
keyid_val = 0;
orig = (const ROTN_ENCRYPTOR *)encryptor;
- wt_api = orig->wt_api;
+ wtext = orig->wtext;
if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
return (errno);
@@ -305,7 +327,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
/*
* Stash the keyid from the configuration string.
*/
- if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+ if ((ret = wtext->config_get(wtext, session, encrypt_config,
"keyid", &keyid)) == 0 && keyid.len != 0) {
/*
* In this demonstration, we expect keyid to be a number.
@@ -327,7 +349,7 @@ rotn_customize(WT_ENCRYPTOR *encryptor, WT_SESSION *session,
* We stash the secret key from the configuration string
* and build some shift bytes to make encryption/decryption easy.
*/
- if ((ret = wt_api->config_get(wt_api, session, encrypt_config,
+ if ((ret = wtext->config_get(wtext, session, encrypt_config,
"secretkey", &secret)) == 0 && secret.len != 0) {
len = secret.len;
if ((rotn_encryptor->secretkey = malloc(len + 1)) == NULL ||
@@ -396,6 +418,53 @@ rotn_terminate(WT_ENCRYPTOR *encryptor, WT_SESSION *session)
}
/*! [WT_ENCRYPTOR terminate] */
+/*
+ * rotn_configure --
+ * WiredTiger no-op encryption configuration.
+ */
+static int
+rotn_configure(ROTN_ENCRYPTOR *rotn_encryptor, WT_CONFIG_ARG *config)
+{
+ WT_CONFIG_ITEM k, v;
+ WT_CONFIG_PARSER *config_parser;
+ WT_EXTENSION_API *wtext; /* Extension API */
+ int ret, t_ret;
+
+ wtext = rotn_encryptor->wtext;
+
+ /* Get the configuration string. */
+ if ((ret = wtext->config_get(wtext, NULL, config, "config", &v)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_EXTENSION_API.config_get"));
+
+ /* Step through the list of configuration options. */
+ if ((ret = wtext->config_parser_open(
+ wtext, NULL, v.str, v.len, &config_parser)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_EXTENSION_API.config_parser_open"));
+
+ while ((ret = config_parser->next(config_parser, &k, &v)) == 0) {
+ if (strncmp("rotn_force_error", k.str, k.len) == 0 &&
+ strlen("rotn_force_error") == k.len) {
+ rotn_encryptor->force_error = v.val == 0 ? 0 : 1;
+ continue;
+		} else {
+ (void)config_parser->close(config_parser);
+ return (rotn_error(rotn_encryptor, NULL, EINVAL,
+ "unknown config key"));
+ }
+ }
+ if ((t_ret = config_parser->close(config_parser)) != 0)
+ return (rotn_error(rotn_encryptor, NULL, t_ret,
+ "WT_CONFIG_PARSER.close"));
+ if (ret != WT_NOTFOUND)
+ return (rotn_error(rotn_encryptor, NULL, ret,
+ "WT_CONFIG_PARSER.next"));
+
+ return (0);
+}
+
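rotn_configure() reads a "config" list nested inside the extension's own configuration. A hedged example of passing it at wiredtiger_open() time; the exact extensions= syntax is assumed from WiredTiger's extension loading and is not shown in this patch:

    /* Assumed usage: load the rotn extension with the test hook set. */
    ret = wiredtiger_open(home, NULL,
        "create,"
        "extensions=[\"./rotn.so=(config=[rotn_force_error=true])\"]",
        &conn);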
/*! [WT_ENCRYPTOR initialization function] */
/*
* wiredtiger_extension_init --
@@ -405,8 +474,7 @@ int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
ROTN_ENCRYPTOR *rotn_encryptor;
-
- (void)config; /* Unused parameters */
+ int ret;
if ((rotn_encryptor = calloc(1, sizeof(ROTN_ENCRYPTOR))) == NULL)
return (errno);
@@ -423,9 +491,12 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
rotn_encryptor->encryptor.sizing = rotn_sizing;
rotn_encryptor->encryptor.customize = rotn_customize;
rotn_encryptor->encryptor.terminate = rotn_terminate;
+ rotn_encryptor->wtext = connection->get_extension_api(connection);
- rotn_encryptor->wt_api = connection->get_extension_api(connection);
-
+ if ((ret = rotn_configure(rotn_encryptor, config)) != 0) {
+ free(rotn_encryptor);
+ return (ret);
+ }
/* Load the encryptor */
return (connection->add_encryptor(
connection, "rotn", (WT_ENCRYPTOR *)rotn_encryptor, NULL));
diff --git a/ext/extractors/csv/csv_extractor.c b/ext/extractors/csv/csv_extractor.c
index 0dd110955ad..34b8d7c7c64 100644
--- a/ext/extractors/csv/csv_extractor.c
+++ b/ext/extractors/csv/csv_extractor.c
@@ -128,7 +128,7 @@ csv_customize(WT_EXTRACTOR *extractor, WT_SESSION *session,
return (errno);
*csv_extractor = *orig;
- csv_extractor->field_num = field_num;
+ csv_extractor->field_num = (int)field_num;
*customp = (WT_EXTRACTOR *)csv_extractor;
return (0);
}
diff --git a/src/async/async_api.c b/src/async/async_api.c
index 44e492cb0e5..416c3c84f7b 100644
--- a/src/async/async_api.c
+++ b/src/async/async_api.c
@@ -43,7 +43,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
* is a possibility a duplicate entry might be inserted, but
* that is not harmful.
*/
- STAILQ_FOREACH(af, &async->formatqh, q) {
+ TAILQ_FOREACH(af, &async->formatqh, q) {
if (af->uri_hash == uri_hash && af->cfg_hash == cfg_hash)
goto setup;
}
@@ -71,7 +71,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri,
WT_ERR(c->close(c));
c = NULL;
- STAILQ_INSERT_HEAD(&async->formatqh, af, q);
+ TAILQ_INSERT_HEAD(&async->formatqh, af, q);
__wt_spin_unlock(session, &async->ops_lock);
WT_ERR(wt_session->close(wt_session, NULL));
@@ -151,15 +151,16 @@ retry:
* If we can set the state then the op entry is ours.
* Start the next search at the next entry after this one.
*/
- if (!WT_ATOMIC_CAS4(op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
+ if (!__wt_atomic_cas32(&op->state, WT_ASYNCOP_FREE, WT_ASYNCOP_READY)) {
WT_STAT_FAST_CONN_INCR(session, async_alloc_race);
goto retry;
}
WT_STAT_FAST_CONN_INCRV(session, async_alloc_view, view);
WT_RET(__async_get_format(conn, uri, config, op));
- op->unique_id = WT_ATOMIC_ADD8(async->op_id, 1);
+ op->unique_id = __wt_atomic_add64(&async->op_id, 1);
op->optype = WT_AOP_NONE;
- (void)WT_ATOMIC_STORE4(async->ops_index, (i + 1) % conn->async_size);
+ (void)__wt_atomic_store32(
+ &async->ops_index, (i + 1) % conn->async_size);
*opp = op;
return (0);
}
@@ -206,15 +207,15 @@ __wt_async_stats_update(WT_SESSION_IMPL *session)
{
WT_ASYNC *async;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
conn = S2C(session);
async = conn->async;
if (async == NULL)
return;
- stats = &conn->stats;
- WT_STAT_SET(stats, async_cur_queue, async->cur_queue);
- WT_STAT_SET(stats, async_max_queue, async->max_queue);
+ stats = conn->stats;
+ WT_STAT_SET(session, stats, async_cur_queue, async->cur_queue);
+ WT_STAT_SET(session, stats, async_max_queue, async->max_queue);
F_SET(conn, WT_CONN_SERVER_ASYNC);
}
@@ -237,7 +238,7 @@ __async_start(WT_SESSION_IMPL *session)
*/
WT_RET(__wt_calloc_one(session, &conn->async));
async = conn->async;
- STAILQ_INIT(&async->formatqh);
+ TAILQ_INIT(&async->formatqh);
WT_RET(__wt_spin_init(session, &async->ops_lock, "ops"));
WT_RET(__wt_cond_alloc(session, "async flush", 0, &async->flush_cond));
WT_RET(__wt_async_op_init(session));
@@ -461,9 +462,9 @@ __wt_async_destroy(WT_SESSION_IMPL *session)
}
/* Free format resources */
- af = STAILQ_FIRST(&async->formatqh);
+ af = TAILQ_FIRST(&async->formatqh);
while (af != NULL) {
- afnext = STAILQ_NEXT(af, q);
+ afnext = TAILQ_NEXT(af, q);
__wt_free(session, af->uri);
__wt_free(session, af->config);
__wt_free(session, af->key_format);
@@ -514,7 +515,7 @@ retry:
*/
__wt_sleep(0, 100000);
- if (!WT_ATOMIC_CAS4(async->flush_state, WT_ASYNC_FLUSH_NONE,
+ if (!__wt_atomic_cas32(&async->flush_state, WT_ASYNC_FLUSH_NONE,
WT_ASYNC_FLUSH_IN_PROGRESS))
goto retry;
/*
@@ -524,7 +525,7 @@ retry:
* things off the work queue with the lock.
*/
async->flush_count = 0;
- (void)WT_ATOMIC_ADD8(async->flush_gen, 1);
+ (void)__wt_atomic_add64(&async->flush_gen, 1);
WT_ASSERT(session, async->flush_op.state == WT_ASYNCOP_FREE);
async->flush_op.state = WT_ASYNCOP_READY;
WT_ERR(__wt_async_op_enqueue(session, &async->flush_op));
diff --git a/src/async/async_op.c b/src/async/async_op.c
index d0c58f584cc..469dbc8e615 100644
--- a/src/async/async_op.c
+++ b/src/async/async_op.c
@@ -237,7 +237,7 @@ __async_op_init(WT_CONNECTION_IMPL *conn, WT_ASYNC_OP_IMPL *op, uint32_t id)
asyncop->c.set_key = __wt_cursor_set_key;
asyncop->c.get_value = __wt_cursor_get_value;
asyncop->c.set_value = __wt_cursor_set_value;
- asyncop->c.recno = 0;
+ asyncop->c.recno = WT_RECNO_OOB;
memset(asyncop->c.raw_recno_buf, 0, sizeof(asyncop->c.raw_recno_buf));
memset(&asyncop->c.key, 0, sizeof(asyncop->c.key));
memset(&asyncop->c.value, 0, sizeof(asyncop->c.value));
@@ -280,7 +280,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
* Enqueue op at the tail of the work queue.
* We get our slot in the ring buffer to use.
*/
- my_alloc = WT_ATOMIC_ADD8(async->alloc_head, 1);
+ my_alloc = __wt_atomic_add64(&async->alloc_head, 1);
my_slot = my_alloc % async->async_qsize;
/*
@@ -300,7 +300,7 @@ __wt_async_op_enqueue(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op)
#endif
WT_PUBLISH(async->async_queue[my_slot], op);
op->state = WT_ASYNCOP_ENQUEUED;
- if (WT_ATOMIC_ADD4(async->cur_queue, 1) > async->max_queue)
+ if (__wt_atomic_add32(&async->cur_queue, 1) > async->max_queue)
WT_PUBLISH(async->max_queue, async->cur_queue);
/*
* Multiple threads may be adding ops to the queue. We need to wait
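The enqueue path above claims a slot with a single atomic add: the returned ticket is globally unique, and ticket % qsize names the producer's slot. The shape of the pattern, reduced to a hedged sketch with hypothetical names (__sync_add_and_fetch stands in for __wt_atomic_add64):

    /* Hypothetical ring: each producer takes the next ticket, then
     * owns slot (ticket % qsize). */
    uint64_t ticket, slot;

    ticket = __sync_add_and_fetch(&ring->head, 1);
    slot = ticket % ring->qsize;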
diff --git a/src/async/async_worker.c b/src/async/async_worker.c
index 4f372d05d19..6a5ec5feeb0 100644
--- a/src/async/async_worker.c
+++ b/src/async/async_worker.c
@@ -67,7 +67,7 @@ retry:
* a race, try again.
*/
my_consume = last_consume + 1;
- if (!WT_ATOMIC_CAS8(async->alloc_tail, last_consume, my_consume))
+ if (!__wt_atomic_cas64(&async->alloc_tail, last_consume, my_consume))
goto retry;
/*
* This item of work is ours to process. Clear it out of the
@@ -81,7 +81,7 @@ retry:
WT_ASSERT(session, async->cur_queue > 0);
WT_ASSERT(session, *op != NULL);
WT_ASSERT(session, (*op)->state == WT_ASYNCOP_ENQUEUED);
- (void)WT_ATOMIC_SUB4(async->cur_queue, 1);
+ (void)__wt_atomic_sub32(&async->cur_queue, 1);
(*op)->state = WT_ASYNCOP_WORKING;
if (*op == &async->flush_op)
@@ -135,7 +135,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
if (op->optype == WT_AOP_COMPACT)
return (0);
WT_ASSERT(session, op->format != NULL);
- STAILQ_FOREACH(ac, &worker->cursorqh, q) {
+ TAILQ_FOREACH(ac, &worker->cursorqh, q) {
if (op->format->cfg_hash == ac->cfg_hash &&
op->format->uri_hash == ac->uri_hash) {
/*
@@ -156,7 +156,7 @@ __async_worker_cursor(WT_SESSION_IMPL *session, WT_ASYNC_OP_IMPL *op,
ac->cfg_hash = op->format->cfg_hash;
ac->uri_hash = op->format->uri_hash;
ac->c = c;
- STAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
+ TAILQ_INSERT_HEAD(&worker->cursorqh, ac, q);
worker->num_cursors++;
*cursorp = c;
return (0);
@@ -297,7 +297,7 @@ __wt_async_worker(void *arg)
async = conn->async;
worker.num_cursors = 0;
- STAILQ_INIT(&worker.cursorqh);
+ TAILQ_INIT(&worker.cursorqh);
while (F_ISSET(conn, WT_CONN_SERVER_ASYNC) &&
F_ISSET(session, WT_SESSION_SERVER_ASYNC)) {
WT_ERR(__async_op_dequeue(conn, session, &op));
@@ -316,7 +316,7 @@ __wt_async_worker(void *arg)
* the queue.
*/
WT_ORDERED_READ(flush_gen, async->flush_gen);
- if (WT_ATOMIC_ADD4(async->flush_count, 1) ==
+ if (__wt_atomic_add32(&async->flush_count, 1) ==
conn->async_workers) {
/*
* We're last. All workers accounted for so
@@ -346,9 +346,9 @@ err: WT_PANIC_MSG(session, ret, "async worker error");
* Worker thread cleanup, close our cached cursors and free all the
* WT_ASYNC_CURSOR structures.
*/
- ac = STAILQ_FIRST(&worker.cursorqh);
+ ac = TAILQ_FIRST(&worker.cursorqh);
while (ac != NULL) {
- acnext = STAILQ_NEXT(ac, q);
+ acnext = TAILQ_NEXT(ac, q);
WT_TRET(ac->c->close(ac->c));
__wt_free(session, ac);
ac = acnext;
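On the dequeue side (the retry loop at the top of this file's changes), consumers race to advance alloc_tail with compare-and-swap and only the winner owns the item. A hedged sketch of that claim step, with hypothetical names:

    /* Whoever CASes tail from t to t+1 exclusively owns item t+1;
     * losers re-read and try again. */
    for (;;) {
        t = ring->tail;
        if (__sync_bool_compare_and_swap(&ring->tail, t, t + 1))
            break;
    }
    /* This thread now processes slot (t + 1) % ring->qsize. */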
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index d593537446b..018f6a20164 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -86,7 +86,7 @@ __block_off_srch(WT_EXT **head, wt_off_t off, WT_EXT ***stack, int skip_off)
* __block_first_srch --
* Search the skiplist for the first available slot.
*/
-static inline int
+static inline bool
__block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
{
WT_EXT *ext;
@@ -99,11 +99,11 @@ __block_first_srch(WT_EXT **head, wt_off_t size, WT_EXT ***stack)
if (ext->size >= size)
break;
if (ext == NULL)
- return (0);
+ return (false);
/* Build a stack for the offset we want. */
__block_off_srch(head, ext->off, stack, 0);
- return (1);
+ return (true);
}
/*
@@ -251,7 +251,7 @@ __block_off_insert(
* Return if any part of a specified range appears on a specified extent
* list.
*/
-static int
+static bool
__block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
{
WT_EXT *before, *after;
@@ -261,10 +261,10 @@ __block_off_match(WT_EXTLIST *el, wt_off_t off, wt_off_t size)
/* If "before" or "after" overlaps, we have a winner. */
if (before != NULL && before->off + before->size > off)
- return (1);
+ return (true);
if (after != NULL && off + size > after->off)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
@@ -283,7 +283,7 @@ __wt_block_misplaced(WT_SESSION_IMPL *session,
* Don't check during the salvage read phase, we might be reading an
* already freed overflow page.
*/
- if (F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
return (0);
/*
diff --git a/src/block/block_open.c b/src/block/block_open.c
index fd00e0c7deb..cfb5b000092 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -158,9 +158,9 @@ __wt_block_configure_first_fit(WT_BLOCK *block, int on)
* as long as any operation wants it.
*/
if (on)
- (void)WT_ATOMIC_ADD4(block->allocfirst, 1);
+ (void)__wt_atomic_add32(&block->allocfirst, 1);
else
- (void)WT_ATOMIC_SUB4(block->allocfirst, 1);
+ (void)__wt_atomic_sub32(&block->allocfirst, 1);
}
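Note that allocfirst is a counter, not a boolean, so nested callers compose: first-fit stays enabled until every enabler has turned it off. The expected bracketing, using the function shown above:

    __wt_block_configure_first_fit(block, 1);	/* enable */
    /* ... work that wants first-fit allocation ... */
    __wt_block_configure_first_fit(block, 0);	/* release */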
/*
@@ -185,7 +185,7 @@ __wt_block_open(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(filename, strlen(filename));
bucket = hash % WT_HASH_ARRAY_SIZE;
__wt_spin_lock(session, &conn->block_lock);
- SLIST_FOREACH(block, &conn->blockhash[bucket], hashl) {
+ TAILQ_FOREACH(block, &conn->blockhash[bucket], hashq) {
if (strcmp(filename, block->name) == 0) {
++block->ref;
*blockp = block;
@@ -398,21 +398,19 @@ err: __wt_scr_free(session, &buf);
void
__wt_block_stat(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_DSRC_STATS *stats)
{
+ WT_UNUSED(session);
+
/*
- * We're looking inside the live system's structure, which normally
- * requires locking: the chances of a corrupted read are probably
- * non-existent, and it's statistics information regardless, but it
- * isn't like this is a common function for an application to call.
+ * Reading from the live system's structure normally requires locking,
+	 * but these are 8B statistics reads, so there's no need.
*/
- __wt_spin_lock(session, &block->live_lock);
- WT_STAT_SET(stats, allocation_size, block->allocsize);
- WT_STAT_SET(stats, block_checkpoint_size, block->live.ckpt_size);
- WT_STAT_SET(stats, block_magic, WT_BLOCK_MAGIC);
- WT_STAT_SET(stats, block_major, WT_BLOCK_MAJOR_VERSION);
- WT_STAT_SET(stats, block_minor, WT_BLOCK_MINOR_VERSION);
- WT_STAT_SET(stats, block_reuse_bytes, block->live.avail.bytes);
- WT_STAT_SET(stats, block_size, block->fh->size);
- __wt_spin_unlock(session, &block->live_lock);
+ stats->allocation_size = block->allocsize;
+ stats->block_checkpoint_size = (int64_t)block->live.ckpt_size;
+ stats->block_magic = WT_BLOCK_MAGIC;
+ stats->block_major = WT_BLOCK_MAJOR_VERSION;
+ stats->block_minor = WT_BLOCK_MINOR_VERSION;
+ stats->block_reuse_bytes = (int64_t)block->live.avail.bytes;
+ stats->block_size = block->fh->size;
}
/*
@@ -426,7 +424,7 @@ __wt_block_manager_size(
wt_off_t filesize;
WT_RET(__wt_filesize_name(session, filename, &filesize));
- WT_STAT_SET(stats, block_size, filesize);
+ stats->block_size = filesize;
return (0);
}
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 0d631396b41..9f7c869dd38 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -200,7 +200,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
if (page_cksum == cksum)
return (0);
- if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
"read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": calculated block checksum "
@@ -208,7 +208,7 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
"of %" PRIu32,
size, (uintmax_t)offset, page_cksum, cksum);
} else
- if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
+ if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
__wt_errx(session,
"read checksum error for %" PRIu32 "B block at "
"offset %" PRIuMAX ": block header checksum "
@@ -218,6 +218,6 @@ __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
/* Panic if a checksum fails during an ordinary read. */
return (block->verify ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+ F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
WT_ERROR : __wt_illegal_value(session, block->name));
}
diff --git a/src/block/block_slvg.c b/src/block/block_slvg.c
index c78a6c39942..641bb8a42f7 100644
--- a/src/block/block_slvg.c
+++ b/src/block/block_slvg.c
@@ -73,19 +73,19 @@ __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block)
* __wt_block_offset_invalid --
* Return if the block offset is insane.
*/
-int
+bool
__wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size)
{
if (size == 0) /* < minimum page size */
- return (1);
+ return (true);
if (size % block->allocsize != 0) /* not allocation-size units */
- return (1);
+ return (true);
if (size > WT_BTREE_PAGE_SIZE_MAX) /* > maximum page size */
- return (1);
+ return (true);
/* past end-of-file */
if (offset + (wt_off_t)size > block->fh->size)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 18f8ca54601..79a52dbcaa3 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -53,12 +53,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
} else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
/*
* The page's modification information can change underfoot if
- * the page is being reconciled, lock the page down.
+ * the page is being reconciled, serialize with reconciliation.
*/
- WT_PAGE_LOCK(session, page);
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
ret = bm->compact_page_skip(bm, session,
mod->mod_replace.addr, mod->mod_replace.size, skipp);
- WT_PAGE_UNLOCK(session, page);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
WT_RET(ret);
}
return (0);
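F_CAS_ATOMIC_WAIT/F_CLR_ATOMIC replace the page lock with an atomic flag bit: set the WT_PAGE_RECONCILIATION bit to enter, clear it to leave. A hedged sketch of the set-with-wait half (hypothetical names, not the real macros' implementation):

    /* Spin until we atomically set a flag bit no one else holds. */
    static void
    flag_set_wait(volatile uint32_t *flagsp, uint32_t flag)
    {
        uint32_t old;

        for (;;) {
            old = *flagsp;
            if ((old & flag) == 0 &&
                __sync_bool_compare_and_swap(flagsp, old, old | flag))
                return;
        }
    }
    /* Clearing is a single atomic AND of the complement. */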
@@ -73,14 +73,12 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_REF *ref;
- int block_manager_begin, evict_reset, skip;
+ int block_manager_begin, skip;
WT_UNUSED(cfg);
- conn = S2C(session);
btree = S2BT(session);
bm = btree->bm;
ref = NULL;
@@ -118,25 +116,6 @@ __wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
*/
__wt_spin_lock(session, &btree->flush_lock);
- /*
- * That leaves eviction, we don't want to block eviction. Set a flag
- * so reconciliation knows compaction is running. If reconciliation
- * sees the flag it locks the page it's writing, we acquire the same
- * lock when reading the page's modify information, serializing access.
- * The same page lock blocks work on the page, but compaction is an
- * uncommon, heavy-weight operation. If it's ever a problem, there's
- * no reason we couldn't use an entirely separate lock than the page
- * lock.
- *
- * We also need to ensure we don't race with an on-going reconciliation.
- * After we set the flag, wait for eviction of this file to drain, and
- * then let eviction continue;
- */
- conn->compact_in_memory_pass = 1;
- WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
- if (evict_reset)
- __wt_evict_file_exclusive_off(session);
-
/* Start compaction. */
WT_ERR(bm->compact_start(bm, session));
block_manager_begin = 1;
@@ -172,11 +151,7 @@ err: if (ref != NULL)
if (block_manager_begin)
WT_TRET(bm->compact_end(bm, session));
- /*
- * Unlock will be a release barrier, use it to update the compaction
- * status for reconciliation.
- */
- conn->compact_in_memory_pass = 0;
+ /* Unblock threads writing leaf pages. */
__wt_spin_unlock(session, &btree->flush_lock);
return (ret);
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 0aed5940533..458a1985e28 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -70,7 +70,7 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
* __cursor_valid --
 * Return if the cursor references a valid key/value pair.
*/
-static inline int
+static inline bool
__cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
{
WT_BTREE *btree;
@@ -133,10 +133,10 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
if (cbt->ins != NULL &&
(upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
- return (1);
+ return (true);
}
/*
@@ -155,7 +155,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* keys, check for retrieval past the end of the page.
*/
if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -170,7 +170,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* "slots", check if search returned a valid slot.
*/
if (cbt->slot >= page->pg_var_entries)
- return (0);
+ return (false);
/*
* Updates aren't stored on the page, an update would have
@@ -181,7 +181,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
cip = &page->pg_var_d[cbt->slot];
if ((cell = WT_COL_PTR(page, cip)) == NULL ||
__wt_cell_type(cell) == WT_CELL_DEL)
- return (0);
+ return (false);
break;
case BTREE_ROW:
/*
@@ -189,7 +189,7 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* key as an on-page object, we're done.
*/
if (cbt->ins != NULL)
- return (0);
+ return (false);
/*
	 * Check if search returned a valid slot (the failure mode is
@@ -198,19 +198,19 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* mirrors the column-store test).
*/
if (cbt->slot >= page->pg_row_entries)
- return (0);
+ return (false);
/* Updates are stored on the page, check for a delete. */
if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
session, page->pg_row_upd[cbt->slot])) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
- return (0);
+ return (false);
if (updp != NULL)
*updp = upd;
}
break;
}
- return (1);
+ return (true);
}
/*
@@ -517,7 +517,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ERR(__cursor_col_search(session, cbt, NULL));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
- cbt->iface.recno = 0;
+ cbt->iface.recno = WT_RECNO_OOB;
/*
* If not overwriting, fail if the key exists. Creating a
@@ -549,8 +549,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/* Insert doesn't maintain a position across calls, clear resources. */
if (ret == 0)
WT_TRET(__curfile_leave(cbt));
@@ -624,8 +627,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
WT_TRET(__curfile_leave(cbt));
if (ret != 0)
WT_TRET(__cursor_reset(cbt));
@@ -702,8 +708,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/*
* If the cursor is configured to overwrite and the record is not
* found, that is exactly what we want.
@@ -790,8 +799,11 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
WT_ILLEGAL_VALUE_ERR(session);
}
-err: if (ret == WT_RESTART)
+err: if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
goto retry;
+ }
/*
* If successful, point the cursor at internal copies of the data. We
@@ -899,7 +911,7 @@ __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp)
* __cursor_equals --
* Return if two cursors reference the same row.
*/
-static inline int
+static inline bool
__cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
{
switch (a->btree->type) {
@@ -911,21 +923,21 @@ __cursor_equals(WT_CURSOR_BTREE *a, WT_CURSOR_BTREE *b)
* one being returned to the application.
*/
if (((WT_CURSOR *)a)->recno == ((WT_CURSOR *)b)->recno)
- return (1);
+ return (true);
break;
case BTREE_ROW:
if (a->ref != b->ref)
- return (0);
+ return (false);
if (a->ins != NULL || b->ins != NULL) {
if (a->ins == b->ins)
- return (1);
+ return (true);
break;
}
if (a->slot == b->slot)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -993,22 +1005,27 @@ __cursor_truncate(WT_SESSION_IMPL *session,
* instantiated the end cursor, so we know that page is pinned in memory
* and we can proceed without concern.
*/
- do {
- WT_RET(__wt_btcur_remove(start));
- /*
- * Reset ret each time through so that we don't loop forever in
- * the cursor equals case.
- */
- for (ret = 0;;) {
- if (stop != NULL && __cursor_equals(start, stop))
- break;
- if ((ret = __wt_btcur_next(start, 1)) != 0)
- break;
- start->compare = 0; /* Exact match */
- if ((ret = rmfunc(session, start, 1)) != 0)
- break;
- }
- } while (ret == WT_RESTART);
+retry: WT_RET(__wt_btcur_remove(start));
+
+ /*
+ * Reset ret each time through so that we don't loop forever in
+ * the cursor equals case.
+ */
+ for (ret = 0;;) {
+ if (stop != NULL && __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ if ((ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+
+ if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
+ goto retry;
+ }
WT_RET_NOTFOUND_OK(ret);
return (0);
@@ -1042,24 +1059,28 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session,
* other thread of control; in that case, repeat the full search to
* refresh the page's modification information.
*/
- do {
- WT_RET(__wt_btcur_remove(start));
- /*
- * Reset ret each time through so that we don't loop forever in
- * the cursor equals case.
- */
- for (ret = 0;;) {
- if (stop != NULL && __cursor_equals(start, stop))
- break;
- if ((ret = __wt_btcur_next(start, 1)) != 0)
- break;
- start->compare = 0; /* Exact match */
- value = (uint8_t *)start->iface.value.data;
- if (*value != 0 &&
- (ret = rmfunc(session, start, 1)) != 0)
- break;
- }
- } while (ret == WT_RESTART);
+retry: WT_RET(__wt_btcur_remove(start));
+ /*
+ * Reset ret each time through so that we don't loop forever in
+ * the cursor equals case.
+ */
+ for (ret = 0;;) {
+ if (stop != NULL && __cursor_equals(start, stop))
+ break;
+ if ((ret = __wt_btcur_next(start, 1)) != 0)
+ break;
+ start->compare = 0; /* Exact match */
+ value = (uint8_t *)start->iface.value.data;
+ if (*value != 0 &&
+ (ret = rmfunc(session, start, 1)) != 0)
+ break;
+ }
+
+ if (ret == WT_RESTART) {
+ WT_STAT_FAST_CONN_INCR(session, cursor_restart);
+ WT_STAT_FAST_DATA_INCR(session, cursor_restart);
+ goto retry;
+ }
WT_RET_NOTFOUND_OK(ret);
return (0);
@@ -1132,6 +1153,19 @@ err: if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
}
/*
+ * __wt_btcur_init --
+ * Initialize a cursor used for internal purposes.
+ */
+void
+__wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ memset(cbt, 0, sizeof(WT_CURSOR_BTREE));
+
+ cbt->iface.session = &session->iface;
+ cbt->btree = S2BT(session);
+}
+
+/*
* __wt_btcur_open --
* Open a btree cursor.
*/
@@ -1147,14 +1181,22 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt)
* Close a btree cursor.
*/
int
-__wt_btcur_close(WT_CURSOR_BTREE *cbt)
+__wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- ret = __curfile_leave(cbt);
+ /*
+ * The in-memory split and lookaside table code creates low-level btree
+ * cursors to search/modify leaf pages. Those cursors don't hold hazard
+ * pointers, nor are they counted in the session handle's cursor count.
+ * Skip the usual cursor tear-down in that case.
+ */
+ if (!lowlevel)
+ ret = __curfile_leave(cbt);
+
__wt_buf_free(session, &cbt->_row_key);
__wt_buf_free(session, &cbt->_tmp);
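With __wt_btcur_init() and the new lowlevel argument to __wt_btcur_close(), internal code gets a stack-allocated cursor lifecycle with no hazard pointer or session cursor accounting. A hedged sketch of the intended pattern (the lookaside code later in this patch uses the init/open half; the close call is assumed):

    WT_CURSOR_BTREE cbt;

    __wt_btcur_init(session, &cbt);
    __wt_btcur_open(&cbt);
    /* ... __wt_row_search/__wt_row_modify on leaf pages ... */
    WT_TRET(__wt_btcur_close(&cbt, 1));	/* lowlevel: skip tear-down */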
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 77d80cdb3a2..38ef407e160 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -340,6 +340,8 @@ __wt_debug_disk(
__dmsg(ds, ", empty-all");
if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
__dmsg(ds, ", empty-none");
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
+ __dmsg(ds, ", LAS-update");
__dmsg(ds, ", generation %" PRIu64 "\n", dsk->write_gen);
@@ -643,12 +645,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
__dmsg(ds, ", disk-mapped");
if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
__dmsg(ds, ", evict-lru");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SCANNING))
- __dmsg(ds, ", scanning");
+ if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION))
+ __dmsg(ds, ", reconciliation");
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
__dmsg(ds, ", split-insert");
- if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED))
- __dmsg(ds, ", split-locked");
if (mod != NULL)
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index 8cca6328f21..c3c7afa1450 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -70,15 +70,15 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
/* If we have a clean page in memory, attempt to evict it. */
if (ref->state == WT_REF_MEM &&
- WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
if (__wt_page_is_modified(ref->page)) {
WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
}
- (void)WT_ATOMIC_ADD4(S2BT(session)->evict_busy, 1);
- ret = __wt_evict_page(session, ref);
- (void)WT_ATOMIC_SUB4(S2BT(session)->evict_busy, 1);
+ (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
+ ret = __wt_evict(session, ref, 0);
+ (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
WT_RET_BUSY_OK(ret);
}
@@ -93,7 +93,7 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
* unclear optimizing for overlapping range deletes is worth the effort.
*/
if (ref->state != WT_REF_DISK ||
- !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
+ !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
return (0);
/*
@@ -176,8 +176,8 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* If the page is still "deleted", it's as we left it,
* reset the state.
*/
- if (WT_ATOMIC_CAS4(
- ref->state, WT_REF_DELETED, WT_REF_DISK))
+ if (__wt_atomic_casv32(
+ &ref->state, WT_REF_DELETED, WT_REF_DISK))
return;
break;
case WT_REF_LOCKED:
@@ -216,10 +216,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* __wt_delete_page_skip --
* If iterating a cursor, skip deleted pages that are visible to us.
*/
-int
+bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
- int skip;
+ bool skip;
/*
* Deleted pages come from two sources: either it's a fast-delete as
@@ -240,13 +240,13 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
* the structure, just to be safe.
*/
if (ref->page_del == NULL)
- return (1);
+ return (true);
- if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
- return (0);
+ if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ return (false);
- skip = (ref->page_del == NULL ||
- __wt_txn_visible(session, ref->page_del->txnid));
+ skip = ref->page_del == NULL ||
+ __wt_txn_visible(session, ref->page_del->txnid);
WT_PUBLISH(ref->state, WT_REF_DELETED);
return (skip);
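The skip test above briefly moves the ref to WT_REF_LOCKED so ref->page_del can't be freed under the visibility check, then publishes WT_REF_DELETED back. The lock/inspect/publish shape in a hedged sketch (txn_visible is a stand-in for __wt_txn_visible, and the real code publishes with a write barrier):

    if (!__sync_bool_compare_and_swap(
        &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        return (false);		/* raced with another thread: don't skip */
    skip = ref->page_del == NULL ||
        txn_visible(session, ref->page_del->txnid);
    ref->state = WT_REF_DELETED;	/* publish: release the ref */
    return (skip);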
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index a05c6217338..73e6affccd3 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -15,7 +15,6 @@ static void __free_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
static void __free_skip_array(WT_SESSION_IMPL *, WT_INSERT_HEAD **, uint32_t);
static void __free_skip_list(WT_SESSION_IMPL *, WT_INSERT *);
static void __free_update(WT_SESSION_IMPL *, WT_UPDATE **, uint32_t);
-static void __free_update_list(WT_SESSION_IMPL *, WT_UPDATE *);
/*
* __wt_ref_out --
@@ -56,7 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_LOCKED));
+ WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION));
#ifdef HAVE_DIAGNOSTIC
{
@@ -160,8 +159,8 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
__wt_free(session, multi->addr.addr);
}
__wt_free(session, mod->mod_multi);
@@ -235,10 +234,7 @@ __wt_free_ref(
* it clean explicitly.)
*/
if (free_pages && ref->page != NULL) {
- if (ref->page->modify != NULL) {
- ref->page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, ref->page);
- }
+ __wt_page_modify_clear(session, ref->page);
__wt_page_out(session, &ref->page);
}
@@ -373,7 +369,7 @@ __free_skip_list(WT_SESSION_IMPL *session, WT_INSERT *ins)
WT_INSERT *next;
for (; ins != NULL; ins = next) {
- __free_update_list(session, ins->upd);
+ __wt_free_update_list(session, ins->upd);
next = WT_SKIP_NEXT(ins);
__wt_free(session, ins);
}
@@ -395,29 +391,23 @@ __free_update(
*/
for (updp = update_head; entries > 0; --entries, ++updp)
if (*updp != NULL)
- __free_update_list(session, *updp);
+ __wt_free_update_list(session, *updp);
/* Free the update array. */
__wt_free(session, update_head);
}
/*
- * __free_update_list --
+ * __wt_free_update_list --
* Walk a WT_UPDATE forward-linked list and free the per-thread combination
* of a WT_UPDATE structure and its associated data.
*/
-static void
-__free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+void
+__wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_UPDATE *next;
for (; upd != NULL; upd = next) {
- /* Everything we free should be visible to everyone. */
- WT_ASSERT(session,
- F_ISSET(session, WT_SESSION_DISCARD_FORCE) ||
- upd->txnid == WT_TXN_ABORTED ||
- __wt_txn_visible_all(session, upd->txnid));
-
next = upd->next;
__wt_free(session, upd);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index c1a8ab61054..6a4243a0fc7 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -255,27 +255,17 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
/* Page sizes */
WT_RET(__btree_page_sizes(session));
- /*
- * Set special flags for the metadata file.
- * Eviction; the metadata file is never evicted.
- * Logging; the metadata file is always logged if possible.
- */
- if (WT_IS_METADATA(btree->dhandle)) {
+ WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
+ if (cval.val)
F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+ else
+ F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
+
+ WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
+ if (cval.val)
F_CLR(btree, WT_BTREE_NO_LOGGING);
- } else {
- WT_RET(__wt_config_gets(session, cfg, "cache_resident", &cval));
- if (cval.val)
- F_SET(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
- else
- F_CLR(btree, WT_BTREE_IN_MEMORY | WT_BTREE_NO_EVICTION);
-
- WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
- if (cval.val)
- F_CLR(btree, WT_BTREE_NO_LOGGING);
- else
- F_SET(btree, WT_BTREE_NO_LOGGING);
- }
+ else
+ F_SET(btree, WT_BTREE_NO_LOGGING);
/* Checksums */
WT_RET(__wt_config_gets(session, cfg, "checksum", &cval));
@@ -352,8 +342,6 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
session, &btree->ovfl_lock, "btree overflow lock"));
WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush lock"));
- __wt_stat_init_dsrc_stats(&btree->dhandle->stats);
-
btree->write_gen = ckpt->write_gen; /* Write generation */
btree->modified = 0; /* Clean */
@@ -372,7 +360,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno)
root_ref->page = root;
root_ref->state = WT_REF_MEM;
- root_ref->key.recno = is_recno ? 1 : 0;
+ root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB;
root->pg_intl_parent_ref = root_ref;
}
@@ -385,12 +373,15 @@ int
__wt_btree_tree_open(
WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
+ WT_BM *bm;
WT_BTREE *btree;
+ WT_DECL_ITEM(tmp);
WT_DECL_RET;
WT_ITEM dsk;
WT_PAGE *page;
btree = S2BT(session);
+ bm = btree->bm;
/*
* A buffer into which we read a root page; don't use a scratch buffer,
@@ -399,12 +390,43 @@ __wt_btree_tree_open(
WT_CLEAR(dsk);
/*
- * Read the page, then build the in-memory version of the page. Clear
- * any local reference to an allocated copy of the disk image on return,
- * the page steals it.
+	 * Read and verify the page (verify to catch encrypted objects we can't
+	 * decrypt: the read itself succeeds, decryption fails, and we want to
+	 * fail gracefully).
+ *
+ * Create a printable version of the address to pass to verify.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+ WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ if ((ret = __wt_bt_read(session, &dsk, addr, addr_size)) == 0)
+ ret = __wt_verify_dsk(session, tmp->data, &dsk);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
+ if (ret != 0)
+ __wt_err(session, ret,
+ "unable to read root page from %s", session->dhandle->name);
+ /*
+ * Failure to open metadata means that the database is unavailable.
+ * Try to provide a helpful failure message.
+ */
+ if (ret != 0 && WT_IS_METADATA(session->dhandle)) {
+ __wt_errx(session,
+ "WiredTiger has failed to open its metadata");
+ __wt_errx(session, "This may be due to the database"
+ " files being encrypted, being from an older"
+ " version or due to corruption on disk");
+ __wt_errx(session, "You should confirm that you have"
+ " opened the database with the correct options including"
+ " all encryption and compression options");
+ }
+ WT_ERR(ret);
+
+ /*
+ * Build the in-memory version of the page. Clear our local reference to
+ * the allocated copy of the disk image on return, the in-memory object
+ * steals it.
*/
- WT_ERR(__wt_bt_read(session, &dsk, addr, addr_size));
- WT_ERR(__wt_verify_dsk(session, (const char *)addr, &dsk));
WT_ERR(__wt_page_inmem(session, NULL, dsk.data, dsk.memsize,
WT_DATA_IN_ITEM(&dsk) ?
WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
@@ -414,6 +436,8 @@ __wt_btree_tree_open(
__wt_root_ref_init(&btree->root, page, btree->type != BTREE_ROW);
err: __wt_buf_free(session, &dsk);
+ __wt_scr_free(session, &tmp);
+
return (ret);
}
@@ -663,9 +687,11 @@ __btree_page_sizes(WT_SESSION_IMPL *session)
WT_RET(__wt_config_gets(session, cfg, "memory_page_max", &cval));
btree->maxmempage =
WT_MAX((uint64_t)cval.val, 50 * (uint64_t)btree->maxleafpage);
- cache_size = S2C(session)->cache_size;
- if (cache_size > 0)
- btree->maxmempage = WT_MIN(btree->maxmempage, cache_size / 4);
+ if (!F_ISSET(S2C(session), WT_CONN_CACHE_POOL)) {
+ if ((cache_size = S2C(session)->cache_size) > 0)
+ btree->maxmempage =
+ WT_MIN(btree->maxmempage, cache_size / 4);
+ }
/*
* Get the split percentage (reconciliation splits pages into smaller
diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c
index a8bbf8a0266..836c1540c5f 100644
--- a/src/btree/bt_io.c
+++ b/src/btree/bt_io.c
@@ -24,10 +24,12 @@ __wt_bt_read(WT_SESSION_IMPL *session,
WT_ENCRYPTOR *encryptor;
WT_ITEM *ip;
const WT_PAGE_HEADER *dsk;
+ const char *fail_msg;
size_t result_len;
btree = S2BT(session);
bm = btree->bm;
+ fail_msg = NULL; /* -Wuninitialized */
/*
* If anticipating a compressed or encrypted block, read into a scratch
@@ -52,40 +54,36 @@ __wt_bt_read(WT_SESSION_IMPL *session,
if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
if (btree->kencryptor == NULL ||
(encryptor = btree->kencryptor->encryptor) == NULL ||
- encryptor->decrypt == NULL)
- WT_ERR_MSG(session, WT_ERROR,
- "read encrypted block where no decryption engine "
- "configured");
+ encryptor->decrypt == NULL) {
+ fail_msg =
+ "encrypted block in file for which no encryption "
+ "configured";
+ goto corrupt;
+ }
WT_ERR(__wt_scr_alloc(session, 0, &etmp));
- ret = __wt_decrypt(session,
- encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp);
- /*
- * It may be file corruption, which is really, really bad, or
- * may be a mismatch of encryption configuration, for example,
- * an incorrect secretkey.
- */
- if (ret != 0)
- WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
- WT_ERROR :
- __wt_illegal_value(session, btree->dhandle->name));
+ if ((ret = __wt_decrypt(session,
+ encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
+ fail_msg = "block decryption failed";
+ goto corrupt;
+ }
ip = etmp;
dsk = ip->data;
- } else if (btree->kencryptor != NULL &&
- !F_ISSET(btree, WT_BTREE_VERIFY) &&
- !F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
- WT_ERR_MSG(session, WT_ERROR,
- "encryption configured, and existing file is not "
- "encrypted");
+ } else if (btree->kencryptor != NULL) {
+ fail_msg =
+ "unencrypted block in file for which encryption configured";
+ goto corrupt;
+ }
if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
if (btree->compressor == NULL ||
- btree->compressor->decompress == NULL)
- WT_ERR_MSG(session, WT_ERROR,
- "read compressed block where no compression engine "
- "configured");
+ btree->compressor->decompress == NULL) {
+ fail_msg =
+ "compressed block in file for which no compression "
+ "configured";
+ goto corrupt;
+ }
/*
* Size the buffer based on the in-memory bytes we're expecting
@@ -118,11 +116,10 @@ __wt_bt_read(WT_SESSION_IMPL *session,
* it's OK, otherwise it's really, really bad.
*/
if (ret != 0 ||
- result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
- WT_ERR(F_ISSET(btree, WT_BTREE_VERIFY) ||
- F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
- WT_ERROR :
- __wt_illegal_value(session, btree->dhandle->name));
+ result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
+ fail_msg = "block decryption failed";
+ goto corrupt;
+ }
} else
/*
* If we uncompressed above, the page is in the correct buffer.
@@ -139,7 +136,7 @@ __wt_bt_read(WT_SESSION_IMPL *session,
if (tmp == NULL)
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
- WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+ WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
}
WT_STAT_FAST_CONN_INCR(session, cache_read);
@@ -149,6 +146,16 @@ __wt_bt_read(WT_SESSION_IMPL *session,
WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);
+ if (0) {
+corrupt: if (ret == 0)
+ ret = WT_ERROR;
+ if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
+ !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
+ __wt_err(session, ret, "%s", fail_msg);
+ ret = __wt_illegal_value(session, btree->dhandle->name);
+ }
+ }
+
err: __wt_scr_free(session, &tmp);
__wt_scr_free(session, &etmp);
return (ret);
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index d8456c5b61f..7104e702418 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -79,7 +79,7 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
* __ovfl_cache_col_visible --
* column-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_col_visible(
WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
@@ -99,15 +99,15 @@ __ovfl_cache_col_visible(
if (__wt_cell_rle(unpack) == 1 &&
upd != NULL && /* Sanity: upd should always be set. */
__wt_txn_visible_all(session, upd->txnid))
- return (1);
- return (0);
+ return (true);
+ return (false);
}
/*
* __ovfl_cache_row_visible --
* row-store: check for a globally visible update.
*/
-static int
+static bool
__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
{
WT_UPDATE *upd;
@@ -115,9 +115,9 @@ __ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
/* Check to see if there's a globally visible update. */
for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
if (__wt_txn_visible_all(session, upd->txnid))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 86edd992b28..ba218fc332c 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -17,214 +17,6 @@ static int __inmem_row_leaf_entries(
WT_SESSION_IMPL *, const WT_PAGE_HEADER *, uint32_t *);
/*
- * __evict_force_check --
- * Check if a page matches the criteria for forced eviction.
- */
-static int
-__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_BTREE *btree;
-
- btree = S2BT(session);
-
- /* Pages are usually small enough, check that first. */
- if (page->memory_footprint < btree->maxmempage)
- return (0);
-
- /* Leaf pages only. */
- if (WT_PAGE_IS_INTERNAL(page))
- return (0);
-
- /*
- * It's hard to imagine a page with a huge memory footprint that has
- * never been modified, but check to be sure.
- */
- if (page->modify == NULL)
- return (0);
-
- /* Trigger eviction on the next page release. */
- __wt_page_evict_soon(page);
-
- /* Bump the oldest ID, we're about to do some visibility checks. */
- __wt_txn_update_oldest(session, 0);
-
- /* If eviction cannot succeed, don't try. */
- return (__wt_page_can_evict(session, page, 1, NULL));
-}
-
-/*
- * __wt_page_in_func --
- * Acquire a hazard pointer to a page; if the page is not in-memory,
- * read it from the disk and build an in-memory version.
- */
-int
-__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
-#ifdef HAVE_DIAGNOSTIC
- , const char *file, int line
-#endif
- )
-{
- WT_BTREE *btree;
- WT_DECL_RET;
- WT_PAGE *page;
- u_int sleep_cnt, wait_cnt;
- int busy, cache_work, force_attempts, oldgen;
-
- btree = S2BT(session);
-
- for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
- switch (ref->state) {
- case WT_REF_DISK:
- case WT_REF_DELETED:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
-
- /*
- * The page isn't in memory, attempt to read it.
- * Make sure there is space in the cache.
- */
- WT_RET(__wt_cache_eviction_check(session, 1, NULL));
- WT_RET(__wt_cache_read(session, ref));
- oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
- F_ISSET(session, WT_SESSION_NO_CACHE);
- continue;
- case WT_REF_READING:
- if (LF_ISSET(WT_READ_CACHE))
- return (WT_NOTFOUND);
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on another thread's read, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
- goto stall;
- case WT_REF_LOCKED:
- if (LF_ISSET(WT_READ_NO_WAIT))
- return (WT_NOTFOUND);
-
- /* Waiting on eviction, stall. */
- WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
- goto stall;
- case WT_REF_SPLIT:
- return (WT_RESTART);
- case WT_REF_MEM:
- /*
- * The page is in memory.
- *
- * Get a hazard pointer if one is required. We cannot
- * be evicting if no hazard pointer is required, we're
- * done.
- */
- if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
- goto skip_evict;
-
- /*
- * The expected reason we can't get a hazard pointer is
- * because the page is being evicted, yield, try again.
- */
-#ifdef HAVE_DIAGNOSTIC
- WT_RET(
- __wt_hazard_set(session, ref, &busy, file, line));
-#else
- WT_RET(__wt_hazard_set(session, ref, &busy));
-#endif
- if (busy) {
- WT_STAT_FAST_CONN_INCR(
- session, page_busy_blocked);
- break;
- }
-
- /*
- * If eviction is configured for this file, check to see
- * if the page qualifies for forced eviction and update
- * the page's generation number. If eviction isn't being
- * done on this file, we're done.
- */
- if (LF_ISSET(WT_READ_NO_EVICT) ||
- F_ISSET(btree, WT_BTREE_NO_EVICTION))
- goto skip_evict;
-
- /*
- * Forcibly evict pages that are too big.
- */
- page = ref->page;
- if (force_attempts < 10 &&
- __evict_force_check(session, page)) {
- ++force_attempts;
- ret = __wt_page_release_evict(session, ref);
- /* If forced eviction fails, stall. */
- if (ret == EBUSY) {
- ret = 0;
- WT_STAT_FAST_CONN_INCR(session,
- page_forcible_evict_blocked);
- goto stall;
- }
- WT_RET(ret);
-
- /*
- * The result of a successful forced eviction
- * is a page-state transition (potentially to
- * an in-memory page we can use, or a restart
- * return for our caller), continue the outer
- * page-acquisition loop.
- */
- continue;
- }
-
- /*
- * If we read the page and we are configured to not
- * trash the cache, set the oldest read generation so
- * the page is forcibly evicted as soon as possible.
- *
- * Otherwise, update the page's read generation.
- */
- if (oldgen && page->read_gen == WT_READGEN_NOTSET)
- __wt_page_evict_soon(page);
- else if (!LF_ISSET(WT_READ_NO_GEN) &&
- page->read_gen != WT_READGEN_OLDEST &&
- page->read_gen < __wt_cache_read_gen(session))
- page->read_gen =
- __wt_cache_read_gen_bump(session);
-skip_evict:
- /*
- * Check if we need an autocommit transaction.
- * Starting a transaction can trigger eviction, so skip
- * it if eviction isn't permitted.
- */
- return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
- __wt_txn_autocommit_check(session));
- WT_ILLEGAL_VALUE(session);
- }
-
- /*
- * We failed to get the page -- yield before retrying, and if
- * we've yielded enough times, start sleeping so we don't burn
- * CPU to no purpose.
- */
- if (++wait_cnt < 1000)
- __wt_yield();
- else {
- if (0) {
-stall: wait_cnt += 1000;
- }
-
- /*
- * If stalling, check if the cache needs help. If we do
- * work for the cache, substitute that for a sleep.
- */
- WT_RET(
- __wt_cache_eviction_check(session, 1, &cache_work));
- if (!cache_work) {
- sleep_cnt = WT_MIN(wait_cnt, 10000);
- wait_cnt *= 2;
- WT_STAT_FAST_CONN_INCRV(
- session, page_sleep, sleep_cnt);
- __wt_sleep(0, sleep_cnt);
- }
- }
- }
-}
-
-/*
* __wt_page_alloc --
* Create or read a page into the cache.
*/
@@ -326,8 +118,8 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
/* Increment the cache statistics. */
__wt_cache_page_inmem_incr(session, page, size);
- (void)WT_ATOMIC_ADD8(cache->bytes_read, size);
- (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);
+ (void)__wt_atomic_add64(&cache->bytes_read, size);
+ (void)__wt_atomic_add64(&cache->pages_inmem, 1);
*pagep = page;
return (0);
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index e27f7c3398c..d26b44e04c0 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -9,19 +9,328 @@
#include "wt_internal.h"
/*
- * __wt_cache_read --
- * Read a page from the file.
+ * __wt_las_remove_block --
+ * Remove all records matching a key prefix from the lookaside store.
*/
int
-__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_las_remove_block(WT_SESSION_IMPL *session,
+ WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size)
{
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ uint64_t las_counter, las_txnid;
+ uint32_t las_id;
+ int exact;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ /*
+ * Search for the block's unique prefix and step through all matching
+ * records, removing them.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != btree_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * The cursor was opened with overwrite=true: remove won't return
+ * WT_NOTFOUND if another thread removes the record first, and the
+ * cursor remains positioned in that case.
+ */
+ WT_ERR(cursor->remove(cursor));
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+err: __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ return (ret);
+}
+
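The search pattern above (position with search_near, step forward, stop when the key prefix no longer matches) is the general way to scan a key range with a WT_CURSOR. Stripped to its skeleton, with matches_prefix as a hypothetical helper:

    cursor->set_key(cursor, prefix);
    if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
        ret = cursor->next(cursor);
    for (; ret == 0; ret = cursor->next(cursor)) {
        if (!matches_prefix(cursor))
            break;
        /* ... operate on the positioned record ... */
    }
    if (ret == WT_NOTFOUND)	/* walked off the end: not an error */
        ret = 0;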
+/*
+ * __col_instantiate --
+ * Update a column-store page entry based on a lookaside table update list.
+ */
+static int
+__col_instantiate(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_col_search(session, recno, ref, cbt));
+ WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __row_instantiate --
+ * Update a row-store page entry based on a lookaside table update list.
+ */
+static int
+__row_instantiate(WT_SESSION_IMPL *session,
+ WT_ITEM *key, WT_REF *ref, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ /* Search the page and add updates. */
+ WT_RET(__wt_row_search(session, key, ref, cbt, 1));
+ WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, 0));
+ return (0);
+}
+
+/*
+ * __las_page_instantiate --
+ * Instantiate lookaside update records in a recently read page.
+ */
+static int
+__las_page_instantiate(WT_SESSION_IMPL *session,
+ WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
+{
+ WT_CURSOR *cursor;
+ WT_CURSOR_BTREE cbt;
+ WT_DECL_ITEM(current_key);
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_ITEM(las_value);
+ WT_DECL_RET;
+ WT_PAGE *page;
+ WT_UPDATE *first_upd, *last_upd, *upd;
+ size_t incr, total_incr;
+ uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
+ uint32_t las_id, upd_size, session_flags;
+ int exact;
+ const uint8_t *p;
+
+ cursor = NULL;
+ page = ref->page;
+ first_upd = last_upd = upd = NULL;
+ total_incr = 0;
+ current_recno = recno = WT_RECNO_OOB;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ __wt_btcur_init(session, &cbt);
+ __wt_btcur_open(&cbt);
+
+ WT_ERR(__wt_scr_alloc(session, 0, &current_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_value));
+
+ /* Open a lookaside table cursor. */
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * The lookaside records are in key and update order, that is, there
+ * will be a set of in-order updates for a key, then another set of
+ * in-order updates for a subsequent key. We process all of the updates
+ * for a key and then insert those updates into the page, then all the
+ * updates for the next key, and so on.
+ *
+ * Search for the block's unique prefix, stepping through any matching
+ * records.
+ */
+ las_addr->data = addr;
+ las_addr->size = addr_size;
+ las_key->size = 0;
+ cursor->set_key(
+ cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
+ if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
+ ret = cursor->next(cursor);
+ for (; ret == 0; ret = cursor->next(cursor)) {
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * Confirm the search using the unique prefix; if not a match,
+ * we're done searching for records for this page.
+ */
+ if (las_id != read_id ||
+ las_addr->size != addr_size ||
+ memcmp(las_addr->data, addr, addr_size) != 0)
+ break;
+
+ /*
+ * If the on-page value has become globally visible, this record
+ * is no longer needed.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ continue;
+
+ /* Allocate the WT_UPDATE structure. */
+ WT_ERR(cursor->get_value(
+ cursor, &upd_txnid, &upd_size, las_value));
+ WT_ERR(__wt_update_alloc(session,
+ (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
+ &upd, &incr));
+ total_incr += incr;
+ upd->txnid = upd_txnid;
+
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = las_key->data;
+ WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
+ if (current_recno == recno)
+ break;
+ WT_ASSERT(session, current_recno < recno);
+
+ if (first_upd != NULL) {
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ current_recno = recno;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (current_key->size == las_key->size &&
+ memcmp(current_key->data,
+ las_key->data, las_key->size) == 0)
+ break;
+
+ if (first_upd != NULL) {
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ }
+ WT_ERR(__wt_buf_set(session,
+ current_key, las_key->data, las_key->size));
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Append the latest update to the list. */
+ if (first_upd == NULL)
+ first_upd = last_upd = upd;
+ else {
+ last_upd->next = upd;
+ last_upd = upd;
+ }
+ upd = NULL;
+ }
+ WT_ERR_NOTFOUND_OK(ret);
+
+ /* Insert the last set of updates, if any. */
+ if (first_upd != NULL)
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ WT_ERR(__col_instantiate(session,
+ current_recno, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__row_instantiate(session,
+ current_key, ref, &cbt, first_upd));
+ first_upd = NULL;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Discard the cursor. */
+ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ if (total_incr != 0) {
+ __wt_cache_page_inmem_incr(session, page, total_incr);
+
+ /*
+	 * We've modified/dirtied the page, but that isn't necessary: a
+	 * clean page is easier to evict. We leave the
+ * lookaside table updates in place, so if we evict this page
+ * without dirtying it, any future instantiation of it will find
+ * the records it needs. If the page is dirtied before eviction,
+ * then we'll write any needed lookaside table records for the
+ * new location of the page.
+ */
+ __wt_page_modify_clear(session, page);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
+
+ /*
+	 * On error, upd points to a single unlinked WT_UPDATE structure;
+	 * first_upd points to a list.
+ */
+ if (upd != NULL)
+ __wt_free(session, upd);
+ if (first_upd != NULL)
+ __wt_free_update_list(session, first_upd);
+
+ __wt_scr_free(session, &current_key);
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+ __wt_scr_free(session, &las_value);
+
+ return (ret);
+}
+
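The instantiation loop above follows a standard group-by-key pattern over
sorted records: accumulate updates while the key repeats, flush the
accumulated list when the key changes, and flush once more after the loop
for the final key. A standalone sketch of just that control flow, with
hypothetical types rather than WiredTiger structures:

#include <stdio.h>
#include <string.h>

struct rec {
	const char *key;
	int value;
};

/* Stand-in for instantiating one key's accumulated update list. */
static void
apply(const char *key, const int *vals, int n)
{
	printf("%s: %d update(s), newest %d\n", key, n, vals[n - 1]);
}

int
main(void)
{
	static const struct rec recs[] = {	/* Sorted by key. */
		{ "a", 1 }, { "a", 2 }, { "b", 7 }, { "c", 3 }, { "c", 4 }
	};
	const char *current;
	int vals[8], i, n;

	current = NULL;
	n = 0;
	for (i = 0; i < (int)(sizeof(recs) / sizeof(recs[0])); ++i) {
		/* Key changed: flush the accumulated list. */
		if (current != NULL && strcmp(current, recs[i].key) != 0) {
			apply(current, vals, n);
			n = 0;
		}
		current = recs[i].key;
		vals[n++] = recs[i].value;
	}
	if (n != 0)			/* Flush the final key. */
		apply(current, vals, n);
	return (0);
}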
+/*
+ * __evict_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static int
+__evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (WT_PAGE_IS_INTERNAL(page))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ /* Trigger eviction on the next page release. */
+ __wt_page_evict_soon(page);
+
+ /* Bump the oldest ID, we're about to do some visibility checks. */
+ __wt_txn_update_oldest(session, 0);
+
+ /* If eviction cannot succeed, don't try. */
+ return (__wt_page_can_evict(session, page, 1, NULL));
+}
+
+/*
+ * __page_read --
+ * Read a page from the file.
+ */
+static int
+__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ const WT_PAGE_HEADER *dsk;
+ WT_BTREE *btree;
WT_DECL_RET;
WT_ITEM tmp;
WT_PAGE *page;
- WT_PAGE_STATE previous_state;
size_t addr_size;
+ uint32_t previous_state;
const uint8_t *addr;
+ btree = S2BT(session);
page = NULL;
/*
@@ -35,9 +344,9 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
* WT_REF_LOCKED, for deleted pages. If successful, we've won the
* race, read the page.
*/
- if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
+ if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
previous_state = WT_REF_DISK;
- else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
previous_state = WT_REF_DELETED;
else
return (0);
@@ -45,8 +354,6 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
/*
* Get the address: if there is no address, the page was deleted, but a
* subsequent search or insert is forcing re-creation of the name space.
- * Otherwise, there's an address, read the backing disk page and build
- * an in-memory version of the page.
*/
WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
if (addr == NULL) {
@@ -54,27 +361,51 @@ __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
WT_ERR(__wt_btree_new_leaf_page(session, &page));
ref->page = page;
- } else {
- /*
- * Read the page, then build the in-memory version of the page.
- * Clear any local reference to an allocated copy of the disk
- * image on return, the page steals it.
- */
- WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
- WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
- WT_DATA_IN_ITEM(&tmp) ?
- WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
- tmp.mem = NULL;
-
- /* If the page was deleted, instantiate that information. */
- if (previous_state == WT_REF_DELETED)
- WT_ERR(__wt_delete_page_instantiate(session, ref));
+ goto done;
}
- WT_ERR(__wt_verbose(session, WT_VERB_READ,
- "page %p: %s", page, __wt_page_type_string(page->type)));
+ /*
+ * There's an address, read or map the backing disk page and build an
+ * in-memory version of the page.
+ */
+ WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
+ WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
+ WT_DATA_IN_ITEM(&tmp) ?
+ WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));
+
+ /*
+ * Clear the local reference to an allocated copy of the disk image on
+ * return; the page steals it, errors in this code should not free it.
+ */
+ tmp.mem = NULL;
- WT_PUBLISH(ref->state, WT_REF_MEM);
+ /*
+ * If reading for a checkpoint, there's no additional work to do, the
+ * page on disk is correct as written.
+ */
+ if (session->dhandle->checkpoint != NULL)
+ goto done;
+
+ /* If the page was deleted, instantiate that information. */
+ if (previous_state == WT_REF_DELETED)
+ WT_ERR(__wt_delete_page_instantiate(session, ref));
+
+ /*
+ * Instantiate updates from the database's lookaside table. The page
+ * flag was set when the page was written, potentially a long time ago.
+ * We only care if the lookaside table is currently active, check that
+ * before doing any work.
+ */
+ dsk = tmp.data;
+ if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
+ WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);
+
+ WT_ERR(__las_page_instantiate(
+ session, ref, btree->id, addr, addr_size));
+ }
+
+done: WT_PUBLISH(ref->state, WT_REF_MEM);
return (0);
err: /*
@@ -90,3 +421,183 @@ err: /*
return (ret);
}
+
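The two compare-and-swap attempts at the top of __page_read implement a
claim-by-CAS state machine: exactly one thread wins the transition out of
WT_REF_DISK (or WT_REF_DELETED) and performs the read, and everyone else
backs off. A minimal sketch of the idiom in C11 atomics; the state values
are illustrative, not WiredTiger's:

#include <stdatomic.h>

enum { REF_DISK, REF_READING, REF_MEM };

static int
claim_read(_Atomic int *state)
{
	int expected = REF_DISK;

	/* True only for the single winning thread. */
	return (atomic_compare_exchange_strong(state, &expected, REF_READING));
}

static void
publish(_Atomic int *state)
{
	/* Release store: the page contents must be visible first. */
	atomic_store_explicit(state, REF_MEM, memory_order_release);
}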
+/*
+ * __wt_page_in_func --
+ * Acquire a hazard pointer to a page; if the page is not in-memory,
+ * read it from the disk and build an in-memory version.
+ */
+int
+__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
+#ifdef HAVE_DIAGNOSTIC
+ , const char *file, int line
+#endif
+ )
+{
+ WT_BTREE *btree;
+ WT_DECL_RET;
+ WT_PAGE *page;
+ u_int sleep_cnt, wait_cnt;
+ int busy, cache_work, force_attempts, oldgen, stalled;
+
+ btree = S2BT(session);
+ stalled = 0;
+
+ for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
+ switch (ref->state) {
+ case WT_REF_DISK:
+ case WT_REF_DELETED:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+
+ /*
+ * The page isn't in memory, read it. If this thread is
+ * allowed to do eviction work, check for space in the
+ * cache.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT))
+ WT_RET(__wt_cache_eviction_check(
+ session, 1, NULL));
+ WT_RET(__page_read(session, ref));
+ oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
+ F_ISSET(session, WT_SESSION_NO_CACHE);
+ continue;
+ case WT_REF_READING:
+ if (LF_ISSET(WT_READ_CACHE))
+ return (WT_NOTFOUND);
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on another thread's read, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_LOCKED:
+ if (LF_ISSET(WT_READ_NO_WAIT))
+ return (WT_NOTFOUND);
+
+ /* Waiting on eviction, stall. */
+ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
+ stalled = 1;
+ break;
+ case WT_REF_SPLIT:
+ return (WT_RESTART);
+ case WT_REF_MEM:
+ /*
+ * The page is in memory.
+ *
+			 * Get a hazard pointer if one is required. If no
+			 * hazard pointer is required the page can't be
+			 * evicted from under us, so we're done.
+ */
+ if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ goto skip_evict;
+
+ /*
+ * The expected reason we can't get a hazard pointer is
+ * because the page is being evicted, yield, try again.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ WT_RET(
+ __wt_hazard_set(session, ref, &busy, file, line));
+#else
+ WT_RET(__wt_hazard_set(session, ref, &busy));
+#endif
+ if (busy) {
+ WT_STAT_FAST_CONN_INCR(
+ session, page_busy_blocked);
+ break;
+ }
+
+ /*
+ * If eviction is configured for this file, check to see
+ * if the page qualifies for forced eviction and update
+ * the page's generation number. If eviction isn't being
+ * done on this file, we're done.
+ */
+ if (LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
+ F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ goto skip_evict;
+
+ /*
+ * Forcibly evict pages that are too big.
+ */
+ page = ref->page;
+ if (force_attempts < 10 &&
+ __evict_force_check(session, page)) {
+ ++force_attempts;
+ ret = __wt_page_release_evict(session, ref);
+ /* If forced eviction fails, stall. */
+ if (ret == EBUSY) {
+ ret = 0;
+ WT_STAT_FAST_CONN_INCR(session,
+ page_forcible_evict_blocked);
+ stalled = 1;
+ break;
+ }
+ WT_RET(ret);
+
+ /*
+ * The result of a successful forced eviction
+ * is a page-state transition (potentially to
+ * an in-memory page we can use, or a restart
+ * return for our caller), continue the outer
+ * page-acquisition loop.
+ */
+ continue;
+ }
+
+ /*
+ * If we read the page and we are configured to not
+ * trash the cache, set the oldest read generation so
+ * the page is forcibly evicted as soon as possible.
+ *
+ * Otherwise, update the page's read generation.
+ */
+ if (oldgen && page->read_gen == WT_READGEN_NOTSET)
+ __wt_page_evict_soon(page);
+ else if (!LF_ISSET(WT_READ_NO_GEN) &&
+ page->read_gen != WT_READGEN_OLDEST &&
+ page->read_gen < __wt_cache_read_gen(session))
+ page->read_gen =
+ __wt_cache_read_gen_bump(session);
+skip_evict:
+ /*
+ * Check if we need an autocommit transaction.
+ * Starting a transaction can trigger eviction, so skip
+ * it if eviction isn't permitted.
+ */
+ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
+ __wt_txn_autocommit_check(session));
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * We failed to get the page -- yield before retrying, and if
+ * we've yielded enough times, start sleeping so we don't burn
+ * CPU to no purpose.
+ */
+ if (stalled)
+ wait_cnt += 1000;
+ else if (++wait_cnt < 1000) {
+ __wt_yield();
+ continue;
+ }
+
+ /*
+ * If stalling and this thread is allowed to do eviction work,
+ * check if the cache needs help. If we do work for the cache,
+ * substitute that for a sleep.
+ */
+ if (!LF_ISSET(WT_READ_NO_EVICT)) {
+ WT_RET(
+ __wt_cache_eviction_check(session, 1, &cache_work));
+ if (cache_work)
+ continue;
+ }
+ sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
+ WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
+ __wt_sleep(0, sleep_cnt);
+ }
+}
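The tail of the loop above is a classic two-phase backoff: yield the
processor while waits are short, then sleep with a growing, capped
interval once the thread is known to be stalled. A simplified sketch,
with thresholds mirroring the constants in the loop but otherwise
arbitrary:

#include <sched.h>
#include <unistd.h>

static void
backoff(unsigned *wait_cnt, unsigned *sleep_usecs)
{
	/* Phase one: spin with yields while waits are short. */
	if (++*wait_cnt < 1000) {
		(void)sched_yield();
		return;
	}

	/* Phase two: sleep, growing toward a 10ms cap. */
	*sleep_usecs =
	    *sleep_usecs + 1000 > 10000 ? 10000 : *sleep_usecs + 1000;
	(void)usleep(*sleep_usecs);
}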
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index f41a5d86e9f..c2a211bdd2d 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -197,9 +197,9 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
* Turn off read checksum and verification error messages while we're
* reading the file, we expect to see corrupted blocks.
*/
- F_SET(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
ret = __slvg_read(session, ss);
- F_CLR(session, WT_SESSION_SALVAGE_CORRUPT_OK);
+ F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
WT_ERR(ret);
/*
@@ -349,9 +349,6 @@ err: WT_TRET(bm->salvage_end(bm, session));
__wt_scr_free(session, &ss->tmp1);
__wt_scr_free(session, &ss->tmp2);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, ss->fcnt));
-
return (ret);
}
@@ -381,8 +378,9 @@ __slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
if (eof)
break;
- /* Report progress every 10 chunks. */
- if (++ss->fcnt % 10 == 0)
+ /* Report progress occasionally. */
+#define WT_SALVAGE_PROGRESS_INTERVAL 100
+ if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0)
WT_ERR(__wt_progress(session, NULL, ss->fcnt));
/*
@@ -1305,7 +1303,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_var_d = save_col_var;
@@ -2011,7 +2009,7 @@ __slvg_row_build_leaf(
/* Write the new version of the leaf page to disk. */
WT_ERR(__slvg_modify_init(session, page));
- WT_ERR(__wt_reconcile(session, ref, cookie, WT_SKIP_UPDATE_ERR));
+ WT_ERR(__wt_reconcile(session, ref, cookie, WT_VISIBILITY_ERR));
/* Reset the page. */
page->pg_row_entries += skip_stop;
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index dbd4042129d..4b9ab45c678 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -45,10 +45,13 @@ static int
__split_stash_add(
WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len)
{
+ WT_CONNECTION_IMPL *conn;
WT_SPLIT_STASH *stash;
WT_ASSERT(session, p != NULL);
+ conn = S2C(session);
+
/* Grow the list as necessary. */
WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
session->split_stash_cnt + 1, &session->split_stash));
@@ -58,8 +61,8 @@ __split_stash_add(
stash->p = p;
stash->len = len;
- WT_STAT_FAST_CONN_ATOMIC_INCRV(session, rec_split_stashed_bytes, len);
- WT_STAT_FAST_CONN_ATOMIC_INCR(session, rec_split_stashed_objects);
+ (void)__wt_atomic_add64(&conn->split_stashed_bytes, len);
+ (void)__wt_atomic_add64(&conn->split_stashed_objects, 1);
/* See if we can free any previous entries. */
if (session->split_stash_cnt > 1)
@@ -75,10 +78,13 @@ __split_stash_add(
void
__wt_split_stash_discard(WT_SESSION_IMPL *session)
{
+ WT_CONNECTION_IMPL *conn;
WT_SPLIT_STASH *stash;
uint64_t oldest;
size_t i;
+ conn = S2C(session);
+
/* Get the oldest split generation. */
oldest = __split_oldest_gen(session);
@@ -93,10 +99,8 @@ __wt_split_stash_discard(WT_SESSION_IMPL *session)
* It's a bad thing if another thread is in this memory after
* we free it, make sure nothing good happens to that thread.
*/
- WT_STAT_FAST_CONN_ATOMIC_DECRV(
- session, rec_split_stashed_bytes, stash->len);
- WT_STAT_FAST_CONN_ATOMIC_DECR(
- session, rec_split_stashed_objects);
+ (void)__wt_atomic_sub64(&conn->split_stashed_bytes, stash->len);
+ (void)__wt_atomic_sub64(&conn->split_stashed_objects, 1);
__wt_overwrite_and_free_len(session, stash->p, stash->len);
}
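The change above moves the stash bookkeeping from statistics macros to
plain atomic counters on the connection. The same pattern in C11 atomics;
the counter names are illustrative:

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

static _Atomic uint64_t stashed_bytes, stashed_objects;

static void
stash_account(size_t len)
{
	/* Adjust the counters without any lock. */
	(void)atomic_fetch_add(&stashed_bytes, (uint64_t)len);
	(void)atomic_fetch_add(&stashed_objects, 1);
}

static void
discard_account(size_t len)
{
	(void)atomic_fetch_sub(&stashed_bytes, (uint64_t)len);
	(void)atomic_fetch_sub(&stashed_objects, 1);
}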
@@ -169,7 +173,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
* __split_should_deepen --
* Return if we should deepen the tree.
*/
-static int
+static bool
__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_BTREE *btree;
@@ -192,7 +196,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* pressure on the cache).
*/
if (page->memory_footprint < btree->maxmempage)
- return (0);
+ return (false);
/*
* Ensure the page has enough entries to make it worth splitting and
@@ -200,7 +204,7 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
* splitting won't help).
*/
if (pindex->entries > btree->split_deepen_min_child)
- return (1);
+ return (true);
/*
* Don't allow a single page to put pressure on cache usage. The root
@@ -212,9 +216,9 @@ __split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
if (pindex->entries >= 100 &&
(__wt_ref_is_root(ref) ||
page->memory_footprint >= S2C(session)->cache_size / 4))
- return (1);
+ return (true);
- return (0);
+ return (false);
}
/*
@@ -339,7 +343,7 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
switch (page->type) {
case WT_PAGE_COL_INT:
- recno = 0;
+ recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
@@ -557,7 +561,7 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
panic = 1;
#ifdef HAVE_DIAGNOSTIC
@@ -680,13 +684,11 @@ __split_multi_inmem(
WT_DECL_RET;
WT_PAGE *page;
WT_UPDATE *upd;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
uint64_t recno;
uint32_t i, slot;
- WT_CLEAR(cbt);
- cbt.iface.session = &session->iface;
- cbt.btree = S2BT(session);
+ __wt_btcur_init(session, &cbt);
__wt_btcur_open(&cbt);
/*
@@ -700,22 +702,22 @@ __split_multi_inmem(
* allocated page on error, when discarding the allocated WT_REF.
*/
WT_RET(__wt_page_inmem(session, ref,
- multi->skip_dsk, ((WT_PAGE_HEADER *)multi->skip_dsk)->mem_size,
+ multi->supd_dsk, ((WT_PAGE_HEADER *)multi->supd_dsk)->mem_size,
WT_PAGE_DISK_ALLOC, &page));
- multi->skip_dsk = NULL;
+ multi->supd_dsk = NULL;
if (orig->type == WT_PAGE_ROW_LEAF)
WT_RET(__wt_scr_alloc(session, 0, &key));
/* Re-create each modification we couldn't write. */
- for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip)
+ for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd)
switch (orig->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
/* Build a key. */
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
- recno = WT_INSERT_RECNO(skip->ins);
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
+ recno = WT_INSERT_RECNO(supd->ins);
/* Search the page. */
WT_ERR(__wt_col_search(session, recno, ref, &cbt));
@@ -726,19 +728,19 @@ __split_multi_inmem(
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
- if (skip->ins == NULL) {
- slot = WT_ROW_SLOT(orig, skip->rip);
+ if (supd->ins == NULL) {
+ slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->pg_row_upd[slot];
orig->pg_row_upd[slot] = NULL;
WT_ERR(__wt_row_leaf_key(
- session, orig, skip->rip, key, 0));
+ session, orig, supd->rip, key, 0));
} else {
- upd = skip->ins->upd;
- skip->ins->upd = NULL;
+ upd = supd->ins->upd;
+ supd->ins->upd = NULL;
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
/* Search the page. */
@@ -761,7 +763,7 @@ __split_multi_inmem(
page->modify->first_dirty_txn = WT_TXN_FIRST;
err: /* Free any resources that may have been cached in the cursor. */
- WT_TRET(__wt_btcur_close(&cbt));
+ WT_TRET(__wt_btcur_close(&cbt, 1));
__wt_scr_free(session, &key);
return (ret);
@@ -797,7 +799,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
*/
ref->home = NULL;
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
/*
* Copy the address: we could simply take the buffer, but that
* would complicate error handling, freeing the reference array
@@ -826,7 +828,7 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
break;
}
- ref->state = multi->skip == NULL ? WT_REF_DISK : WT_REF_MEM;
+ ref->state = multi->supd == NULL ? WT_REF_DISK : WT_REF_MEM;
/*
* If our caller wants to track the memory allocations, we have a return
@@ -837,16 +839,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
return (0);
}
-#define WT_SPLIT_EXCLUSIVE 0x01 /* Page held exclusively */
-#define WT_SPLIT_INMEM 0x02 /* In-memory split */
-
/*
* __split_parent --
* Resolve a multi-page split, inserting new information into the parent.
*/
static int
__split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
- WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, uint32_t flags)
+ WT_REF **ref_new, uint32_t new_entries, size_t parent_incr, int exclusive)
{
WT_DECL_RET;
WT_IKEY *ikey;
@@ -874,26 +873,39 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* memory inside of the lock and may want to invest effort in making the
* locked period shorter.
*
- * We could race with another thread deepening our parent. To deal
- * with that, read the parent pointer each time we try to lock it, and
- * check that it's still correct after it is locked.
+	 * We use the reconciliation lock here because we have to both
+	 * single-thread the split and lock out reconciliation of the parent:
+	 * reconciliation can't deal with finding a split child during
+	 * internal page traversal. Basically, there's no reason to use a
+	 * different lock if we have to block reconciliation anyway.
*/
for (;;) {
parent = ref->home;
- F_CAS_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED, ret);
+ F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret);
if (ret == 0) {
+ /*
+ * We can race with another thread deepening our parent.
+ * To deal with that, read the parent pointer each time
+ * we try to lock it, and check it's still correct after
+ * it's locked.
+ */
if (parent == ref->home)
break;
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
continue;
}
+
/*
- * If we're attempting an in-memory split and we can't lock the
- * parent, give up. This avoids an infinite loop where we are
- * trying to split a page while its parent is being
- * checkpointed.
+ * A checkpoint reconciling this parent page can deadlock with
+ * our split. We have an exclusive page lock on the child before
+ * we acquire the page's reconciliation lock, and reconciliation
+ * acquires the page's reconciliation lock before it encounters
+ * the child's exclusive lock (which causes reconciliation to
+ * loop until the exclusive lock is resolved). If we can't lock
+ * the parent, give up to avoid that deadlock.
*/
- if (LF_ISSET(WT_SPLIT_INMEM))
+ if (S2BT(session)->checkpointing)
return (EBUSY);
__wt_yield();
}
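The loop above is the lock-then-revalidate idiom: the parent pointer can
change until the lock is held, so it is re-read before every attempt and
re-checked after every successful acquisition. A self-contained sketch
with a spinlock standing in for F_CAS_ATOMIC (memory-ordering details of
the pointer read are elided):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *home;	/* Parent pointer: may change under us. */
	atomic_flag lock;
};

static int
try_lock(struct node *n)
{
	/* test_and_set returns the previous value: false means we won. */
	return (!atomic_flag_test_and_set(&n->lock));
}

static void
unlock(struct node *n)
{
	atomic_flag_clear(&n->lock);
}

static struct node *
lock_parent(struct node *child)
{
	struct node *parent;

	for (;;) {
		parent = child->home;		/* Read before locking. */
		if (!try_lock(parent))
			continue;		/* Or yield/back off. */
		if (parent == child->home)	/* Still our parent? */
			return (parent);
		unlock(parent);			/* Raced: retry. */
	}
}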
@@ -905,9 +917,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* could conceivably be evicted. Get a hazard pointer on the parent
* now, so that we can safely access it after updating the index.
*
- * Take care that getting the page doesn't trigger eviction, or we
- * could block trying to split a different child of our parent and
- * deadlock.
+	 * Take care that getting the page doesn't trigger eviction work: we
+	 * could block trying to split a different child of our parent and
+	 * deadlock, or we could be the eviction server other threads rely on
+	 * to populate the eviction queue.
*/
if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
@@ -933,8 +946,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
if (next_ref->state == WT_REF_DELETED &&
__wt_delete_page_skip(session, next_ref) &&
- WT_ATOMIC_CAS4(next_ref->state,
- WT_REF_DELETED, WT_REF_SPLIT))
+ __wt_atomic_casv32(
+ &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT))
deleted_entries++;
}
@@ -994,7 +1007,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex);
WT_INTL_INDEX_SET(parent, alloc_index);
- split_gen = WT_ATOMIC_ADD8(S2C(session)->split_gen, 1);
+ split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
alloc_index = NULL;
#ifdef HAVE_DIAGNOSTIC
@@ -1089,8 +1102,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Add it to the session discard list, to be freed when it's safe.
*/
size = sizeof(WT_PAGE_INDEX) + pindex->entries * sizeof(WT_REF *);
- WT_TRET(__split_safe_free(session,
- split_gen, LF_ISSET(WT_SPLIT_EXCLUSIVE) ? 1 : 0, pindex, size));
+ WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size));
parent_decr += size;
/*
@@ -1115,7 +1127,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* Do the check here because we've just grown the parent page and
* are holding it locked.
*/
- if (ret == 0 && !LF_ISSET(WT_SPLIT_EXCLUSIVE) &&
+ if (ret == 0 && !exclusive &&
__split_should_deepen(session, parent_ref))
ret = __split_deepen(session, parent);
@@ -1125,7 +1137,7 @@ err: if (!complete)
if (next_ref->state == WT_REF_SPLIT)
next_ref->state = WT_REF_DELETED;
}
- F_CLR_ATOMIC(parent, WT_PAGE_SPLIT_LOCKED);
+ F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION);
if (hazard)
WT_TRET(__wt_hazard_clear(session, parent));
@@ -1164,7 +1176,13 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
right = NULL;
page_decr = parent_incr = right_incr = 0;
+ /*
+	 * Assert splitting makes sense; specifically, assert the page is
+	 * dirty. We depend on that: otherwise, the page might be evicted
+	 * based on its last reconciliation, which no longer matches reality
+	 * after the split.
+ */
WT_ASSERT(session, __wt_page_can_split(session, page));
+ WT_ASSERT(session, __wt_page_is_modified(page));
/* Find the last item on the page. */
ins_head = page->pg_row_entries == 0 ?
@@ -1192,7 +1210,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* The key-instantiation code checks for races, clear the key fields so
* we don't trigger them.
*/
- child->key.recno = 0;
+ child->key.recno = WT_RECNO_OOB;
child->key.ikey = NULL;
child->state = WT_REF_MEM;
@@ -1367,7 +1385,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, WT_SPLIT_INMEM)) != 0) {
+ session, ref, split_ref, 2, parent_incr, 0)) != 0) {
/*
* Move the insert list element back to the original page list.
* For simplicity, the previous skip list pointers originally
@@ -1384,8 +1402,7 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* We marked the new page dirty; we're going to discard it, but
* first mark it clean and fix up the cache statistics.
*/
- right->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, right);
+ __wt_page_modify_clear(session, right);
WT_ERR(ret);
}
@@ -1442,8 +1459,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
+ __wt_page_modify_clear(session, page);
__wt_ref_out(session, ref);
/* Swap the new page into place. */
@@ -1486,8 +1502,8 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Split into the parent; if we're closing the file, we hold it
* exclusively.
*/
- WT_ERR(__split_parent( session, ref, ref_new,
- new_entries, parent_incr, closing ? WT_SPLIT_EXCLUSIVE : 0));
+ WT_ERR(__split_parent(
+ session, ref, ref_new, new_entries, parent_incr, closing));
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split);
@@ -1500,10 +1516,7 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* Pages with unresolved changes are not marked clean during
* reconciliation, do it now.
*/
- if (__wt_page_is_modified(page)) {
- mod->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ __wt_page_modify_clear(session, page);
__wt_page_out(session, &page);
return (0);
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 6285edde217..b379712f6e7 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -8,10 +8,11 @@
#include "wt_internal.h"
-static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_col_var(WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
-static void __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS *);
+static int __stat_page(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void __stat_page_col_var(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void __stat_page_row_int(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
+static void
+ __stat_page_row_leaf(WT_SESSION_IMPL *, WT_PAGE *, WT_DSRC_STATS **);
/*
* __wt_btree_stat_init --
@@ -23,22 +24,22 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_RET;
- WT_DSRC_STATS *stats;
+ WT_DSRC_STATS **stats;
WT_REF *next_walk;
btree = S2BT(session);
bm = btree->bm;
- stats = &btree->dhandle->stats;
+ stats = btree->dhandle->stats;
- WT_RET(bm->stat(bm, session, stats));
+ WT_RET(bm->stat(bm, session, stats[0]));
- WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt);
- WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth);
- WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage);
- WT_STAT_SET(stats, btree_maxintlkey, btree->maxintlkey);
- WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage);
- WT_STAT_SET(stats, btree_maxleafkey, btree->maxleafkey);
- WT_STAT_SET(stats, btree_maxleafvalue, btree->maxleafvalue);
+ WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
+ WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
+ WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
+ WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
+ WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
+ WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
+ WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);
/* Everything else is really, really expensive. */
if (!F_ISSET(cst, WT_CONN_STAT_ALL))
@@ -47,14 +48,15 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
/*
* Clear the statistics we're about to count.
*/
- WT_STAT_SET(stats, btree_column_deleted, 0);
- WT_STAT_SET(stats, btree_column_fix, 0);
- WT_STAT_SET(stats, btree_column_internal, 0);
- WT_STAT_SET(stats, btree_column_variable, 0);
- WT_STAT_SET(stats, btree_entries, 0);
- WT_STAT_SET(stats, btree_overflow, 0);
- WT_STAT_SET(stats, btree_row_internal, 0);
- WT_STAT_SET(stats, btree_row_leaf, 0);
+ WT_STAT_SET(session, stats, btree_column_deleted, 0);
+ WT_STAT_SET(session, stats, btree_column_fix, 0);
+ WT_STAT_SET(session, stats, btree_column_internal, 0);
+ WT_STAT_SET(session, stats, btree_column_rle, 0);
+ WT_STAT_SET(session, stats, btree_column_variable, 0);
+ WT_STAT_SET(session, stats, btree_entries, 0);
+ WT_STAT_SET(session, stats, btree_overflow, 0);
+ WT_STAT_SET(session, stats, btree_row_internal, 0);
+ WT_STAT_SET(session, stats, btree_row_leaf, 0);
next_walk = NULL;
while ((ret = __wt_tree_walk(session, &next_walk, NULL, 0)) == 0 &&
@@ -71,7 +73,7 @@ __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
* Stat any Btree page.
*/
static int
-__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
/*
* All internal pages and overflow pages are trivial, all we track is
@@ -79,14 +81,15 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
*/
switch (page->type) {
case WT_PAGE_COL_FIX:
- WT_STAT_INCR(stats, btree_column_fix);
- WT_STAT_INCRV(stats, btree_entries, page->pg_fix_entries);
+ WT_STAT_INCR(session, stats, btree_column_fix);
+ WT_STAT_INCRV(
+ session, stats, btree_entries, page->pg_fix_entries);
break;
case WT_PAGE_COL_INT:
- WT_STAT_INCR(stats, btree_column_internal);
+ WT_STAT_INCR(session, stats, btree_column_internal);
break;
case WT_PAGE_COL_VAR:
- __stat_page_col_var(page, stats);
+ __stat_page_col_var(session, page, stats);
break;
case WT_PAGE_ROW_INT:
__stat_page_row_int(session, page, stats);
@@ -104,21 +107,22 @@ __stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
* Stat a WT_PAGE_COL_VAR page.
*/
static void
-__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
+__stat_page_col_var(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT *ins;
WT_UPDATE *upd;
- uint64_t deleted_cnt, entry_cnt, ovfl_cnt;
+ uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
uint32_t i;
int orig_deleted;
unpack = &_unpack;
- deleted_cnt = entry_cnt = ovfl_cnt = 0;
+ deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;
- WT_STAT_INCR(stats, btree_column_variable);
+ WT_STAT_INCR(session, stats, btree_column_variable);
/*
* Walk the page counting regular items, adjusting if the item has been
@@ -137,8 +141,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
__wt_cell_unpack(cell, unpack);
if (unpack->type == WT_CELL_ADDR_DEL)
orig_deleted = 1;
- else
+ else {
entry_cnt += __wt_cell_rle(unpack);
+ rle_cnt += __wt_cell_rle(unpack) - 1;
+ }
if (unpack->ovfl)
++ovfl_cnt;
}
@@ -169,9 +175,10 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
else
++entry_cnt;
- WT_STAT_INCRV(stats, btree_column_deleted, deleted_cnt);
- WT_STAT_INCRV(stats, btree_entries, entry_cnt);
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
+ WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
+ WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
/*
@@ -180,7 +187,7 @@ __stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
*/
static void
__stat_page_row_int(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -190,7 +197,7 @@ __stat_page_row_int(
btree = S2BT(session);
ovfl_cnt = 0;
- WT_STAT_INCR(stats, btree_row_internal);
+ WT_STAT_INCR(session, stats, btree_row_internal);
/*
* Overflow keys are hard: we have to walk the disk image to count them,
@@ -204,7 +211,7 @@ __stat_page_row_int(
++ovfl_cnt;
}
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
/*
@@ -213,7 +220,7 @@ __stat_page_row_int(
*/
static void
__stat_page_row_leaf(
- WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS *stats)
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -226,7 +233,7 @@ __stat_page_row_leaf(
btree = S2BT(session);
entry_cnt = ovfl_cnt = 0;
- WT_STAT_INCR(stats, btree_row_leaf);
+ WT_STAT_INCR(session, stats, btree_row_leaf);
/*
* Walk any K/V pairs inserted into the page before the first from-disk
@@ -267,6 +274,6 @@ __stat_page_row_leaf(
++ovfl_cnt;
}
- WT_STAT_INCRV(stats, btree_entries, entry_cnt);
- WT_STAT_INCRV(stats, btree_overflow, ovfl_cnt);
+ WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
+ WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
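The WT_DSRC_STATS * to WT_DSRC_STATS ** change appears to reflect
statistics being striped across an array of structures, which is why the
macros now also take the session: writers update one slot, and a read
sums the slots, trading memory for less cache-line contention. A sketch
of the general technique; the slot count, field, and helpers here are
illustrative, not WiredTiger's:

#include <stdint.h>

#define STAT_SLOTS	8	/* Illustrative slot count. */

struct stats {
	int64_t btree_entries;
};

/* Writers touch one slot, picked here by a caller-supplied id. */
static void
stat_incrv(struct stats **stats, unsigned id, int64_t v)
{
	stats[id % STAT_SLOTS]->btree_entries += v;
}

/* Readers sum across all slots. */
static int64_t
stat_read(struct stats **stats)
{
	int64_t sum;
	unsigned i;

	for (sum = 0, i = 0; i < STAT_SLOTS; ++i)
		sum += stats[i]->btree_entries;
	return (sum);
}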
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 838d778dadf..29ae5b185cd 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -259,7 +259,6 @@ __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op)
break;
case WT_SYNC_CLOSE:
case WT_SYNC_DISCARD:
- case WT_SYNC_DISCARD_FORCE:
WT_ERR(__wt_evict_file(session, op));
break;
WT_ILLEGAL_VALUE_ERR(session);
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 3f615babb07..1fd660d4cd4 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -245,9 +245,6 @@ err: /* Inform the underlying block manager we're done. */
if (ckptbase != NULL)
__wt_meta_ckptlist_free(session, ckptbase);
- /* Wrap up reporting. */
- WT_TRET(__wt_progress(session, NULL, vs->fcnt));
-
/* Free allocated memory. */
__wt_scr_free(session, &vs->max_key);
__wt_scr_free(session, &vs->max_addr);
@@ -343,9 +340,10 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
* of the page to be built, and then a subsequent logical verification
* which happens here.
*
- * Report progress every 10 pages.
+ * Report progress occasionally.
*/
- if (++vs->fcnt % 10 == 0)
+#define WT_VERIFY_PROGRESS_INTERVAL 100
+ if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
WT_RET(__wt_progress(session, NULL, vs->fcnt));
#ifdef HAVE_DIAGNOSTIC
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index 904a16a7548..e80bde3c91e 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -26,13 +26,13 @@ static int __verify_dsk_row(
WT_SESSION_IMPL *, const char *, const WT_PAGE_HEADER *);
#define WT_ERR_VRFY(session, ...) do { \
- if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
__wt_errx(session, __VA_ARGS__); \
goto err; \
} while (0)
#define WT_RET_VRFY(session, ...) do { \
- if (!(F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))) \
+ if (!(F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))) \
__wt_errx(session, __VA_ARGS__); \
return (WT_ERROR); \
} while (0)
@@ -43,7 +43,7 @@ static int __verify_dsk_row(
*/
int
__wt_verify_dsk_image(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
+ const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
{
const uint8_t *p, *end;
u_int i;
@@ -63,7 +63,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
default:
WT_RET_VRFY(session,
"page at %s has an invalid type of %" PRIu32,
- addr, dsk->type);
+ tag, dsk->type);
}
/* Check the page record number. */
@@ -71,51 +71,54 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_INT:
case WT_PAGE_COL_VAR:
- if (dsk->recno != 0)
+ if (dsk->recno != WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a record number of zero",
- __wt_page_type_string(dsk->type), addr);
+ "%s page at %s has an invalid record number of %d",
+ __wt_page_type_string(dsk->type), tag, WT_RECNO_OOB);
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- if (dsk->recno == 0)
+ if (dsk->recno == WT_RECNO_OOB)
break;
WT_RET_VRFY(session,
- "%s page at %s has a non-zero record number",
- __wt_page_type_string(dsk->type), addr);
+ "%s page at %s has a record number, which is illegal for "
+ "this page type",
+ __wt_page_type_string(dsk->type), tag);
}
/* Check the page flags. */
flags = dsk->flags;
if (LF_ISSET(WT_PAGE_COMPRESSED))
LF_CLR(WT_PAGE_COMPRESSED);
- if (LF_ISSET(WT_PAGE_ENCRYPTED))
- LF_CLR(WT_PAGE_ENCRYPTED);
if (dsk->type == WT_PAGE_ROW_LEAF) {
if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
LF_ISSET(WT_PAGE_EMPTY_V_NONE))
WT_RET_VRFY(session,
"page at %s has invalid flags combination: 0x%"
PRIx8,
- addr, dsk->flags);
+ tag, dsk->flags);
if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
LF_CLR(WT_PAGE_EMPTY_V_ALL);
if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
LF_CLR(WT_PAGE_EMPTY_V_NONE);
}
+ if (LF_ISSET(WT_PAGE_ENCRYPTED))
+ LF_CLR(WT_PAGE_ENCRYPTED);
+ if (LF_ISSET(WT_PAGE_LAS_UPDATE))
+ LF_CLR(WT_PAGE_LAS_UPDATE);
if (flags != 0)
WT_RET_VRFY(session,
"page at %s has invalid flags set: 0x%" PRIx8,
- addr, flags);
+ tag, flags);
/* Unused bytes */
for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
if (*p != '\0')
WT_RET_VRFY(session,
"page at %s has non-zero unused page header bytes",
- addr);
+ tag);
/*
* Any bytes after the data chunk should be nul bytes; ignore if the
@@ -129,7 +132,7 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
if (*p != '\0')
WT_RET_VRFY(session,
"%s page at %s has non-zero trailing bytes",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
}
/* Check for empty pages, then verify the items on the page. */
@@ -141,28 +144,28 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_LEAF:
if (!empty_page_ok && dsk->u.entries == 0)
WT_RET_VRFY(session, "%s page at %s has no entries",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
break;
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
if (dsk->u.datalen == 0)
WT_RET_VRFY(session, "%s page at %s has no data",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
break;
}
switch (dsk->type) {
case WT_PAGE_COL_INT:
- return (__verify_dsk_col_int(session, addr, dsk));
+ return (__verify_dsk_col_int(session, tag, dsk));
case WT_PAGE_COL_FIX:
- return (__verify_dsk_col_fix(session, addr, dsk));
+ return (__verify_dsk_col_fix(session, tag, dsk));
case WT_PAGE_COL_VAR:
- return (__verify_dsk_col_var(session, addr, dsk));
+ return (__verify_dsk_col_var(session, tag, dsk));
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- return (__verify_dsk_row(session, addr, dsk));
+ return (__verify_dsk_row(session, tag, dsk));
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_OVFL:
- return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
+ return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
WT_ILLEGAL_VALUE(session);
}
/* NOTREACHED */
@@ -173,9 +176,9 @@ __wt_verify_dsk_image(WT_SESSION_IMPL *session,
* Verify a single Btree page as read from disk.
*/
int
-__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
+__wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf)
{
- return (__wt_verify_dsk_image(session, addr, buf->data, buf->size, 0));
+ return (__wt_verify_dsk_image(session, tag, buf->data, buf->size, 0));
}
/*
@@ -184,7 +187,7 @@ __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
*/
static int
__verify_dsk_row(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -220,16 +223,16 @@ __verify_dsk_row(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
- ret = __err_cell_corrupted(session, cell_num, addr);
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0) {
+ ret = __err_cell_corrupted(session, cell_num, tag);
goto err;
}
/* Check the raw and collapsed cell types. */
WT_ERR(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_ERR(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
cell_type = unpack->type;
/*
@@ -256,7 +259,7 @@ __verify_dsk_row(
WT_ERR_VRFY(session,
"cell %" PRIu32 " on page at %s is the "
"first of two adjacent keys",
- cell_num - 1, addr);
+ cell_num - 1, tag);
}
last_cell_type = WAS_KEY;
break;
@@ -269,14 +272,14 @@ __verify_dsk_row(
switch (last_cell_type) {
case FIRST:
WT_ERR_VRFY(session,
- "page at %s begins with a value", addr);
+ "page at %s begins with a value", tag);
case WAS_KEY:
break;
case WAS_VALUE:
WT_ERR_VRFY(session,
"cell %" PRIu32 " on page at %s is the "
"first of two adjacent values",
- cell_num - 1, addr);
+ cell_num - 1, tag);
}
last_cell_type = WAS_VALUE;
break;
@@ -327,7 +330,7 @@ __verify_dsk_row(
"the %" PRIu32 " key on page at %s is the first "
"non-overflow key on the page and has a non-zero "
"prefix compression value",
- cell_num, addr);
+ cell_num, tag);
/* Confirm the prefix compression count is possible. */
if (cell_num > 1 && prefix > last->size)
@@ -335,7 +338,7 @@ __verify_dsk_row(
"key %" PRIu32 " on page at %s has a prefix "
"compression count of %" PRIu32 ", larger than "
"the length of the previous key, %" WT_SIZET_FMT,
- cell_num, addr, prefix, last->size);
+ cell_num, tag, prefix, last->size);
/*
* If Huffman decoding required, unpack the cell to build the
@@ -394,7 +397,7 @@ key_compare: /*
WT_ERR_VRFY(session,
"the %" PRIu32 " and %" PRIu32 " keys on "
"page at %s are incorrectly sorted",
- cell_num - 2, cell_num, addr);
+ cell_num - 2, cell_num, tag);
}
/*
@@ -414,7 +417,7 @@ key_compare: /*
}
WT_ASSERT(session, last != current);
}
- WT_ERR(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_ERR(__verify_dsk_memsize(session, tag, dsk, cell));
/*
* On row-store internal pages, and on row-store leaf pages, where the
@@ -428,7 +431,7 @@ key_compare: /*
"%s page at %s has a key count of %" PRIu32 " and a "
"physical entry count of %" PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL) &&
key_cnt != dsk->u.entries)
@@ -437,7 +440,7 @@ key_compare: /*
"key count of %" PRIu32 " and a physical entry count of %"
PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (dsk->type == WT_PAGE_ROW_LEAF &&
F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE) &&
key_cnt * 2 != dsk->u.entries)
@@ -446,10 +449,10 @@ key_compare: /*
"key count of %" PRIu32 " and a physical entry count of %"
PRIu32,
__wt_page_type_string(dsk->type),
- addr, key_cnt, dsk->u.entries);
+ tag, key_cnt, dsk->u.entries);
if (0) {
-eof: ret = __err_eof(session, cell_num, addr);
+eof: ret = __err_eof(session, cell_num, tag);
}
if (0) {
@@ -468,7 +471,7 @@ err: if (ret == 0)
*/
static int
__verify_dsk_col_int(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -487,20 +490,20 @@ __verify_dsk_col_int(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
- return (__err_cell_corrupted(session, cell_num, addr));
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
/* Check if any referenced item is entirely in the file. */
if (!bm->addr_valid(bm, session, unpack->data, unpack->size))
- return (__err_eof(session, cell_num, addr));
+ return (__err_eof(session, cell_num, tag));
}
- WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_RET(__verify_dsk_memsize(session, tag, dsk, cell));
return (0);
}
@@ -511,7 +514,7 @@ __verify_dsk_col_int(
*/
static int
__verify_dsk_col_fix(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BTREE *btree;
uint32_t datalen;
@@ -519,7 +522,7 @@ __verify_dsk_col_fix(
btree = S2BT(session);
datalen = __bitstr_size(btree->bitcnt * dsk->u.entries);
- return (__verify_dsk_chunk(session, addr, dsk, datalen));
+ return (__verify_dsk_chunk(session, tag, dsk, datalen));
}
/*
@@ -528,7 +531,7 @@ __verify_dsk_col_fix(
*/
static int
__verify_dsk_col_var(
- WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
+ WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
WT_BM *bm;
WT_BTREE *btree;
@@ -554,20 +557,20 @@ __verify_dsk_col_var(
++cell_num;
/* Carefully unpack the cell. */
- if (__wt_cell_unpack_safe(cell, unpack, end) != 0)
- return (__err_cell_corrupted(session, cell_num, addr));
+ if (__wt_cell_unpack_safe(cell, unpack, dsk, end) != 0)
+ return (__err_cell_corrupted(session, cell_num, tag));
/* Check the raw and collapsed cell types. */
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->raw, dsk->type));
+ session, cell_num, tag, unpack->raw, dsk->type));
WT_RET(__err_cell_type(
- session, cell_num, addr, unpack->type, dsk->type));
+ session, cell_num, tag, unpack->type, dsk->type));
cell_type = unpack->type;
/* Check if any referenced item is entirely in the file. */
if (cell_type == WT_CELL_VALUE_OVFL &&
!bm->addr_valid(bm, session, unpack->data, unpack->size))
- return (__err_eof(session, cell_num, addr));
+ return (__err_eof(session, cell_num, tag));
/*
* Compare the last two items and see if reconciliation missed
@@ -586,7 +589,7 @@ match_err: WT_RET_VRFY(session,
"data entries %" PRIu32 " and %" PRIu32
" on page at %s are identical and should "
"have been run-length encoded",
- cell_num - 1, cell_num, addr);
+ cell_num - 1, cell_num, tag);
switch (cell_type) {
case WT_CELL_DEL:
@@ -604,7 +607,7 @@ match_err: WT_RET_VRFY(session,
break;
}
}
- WT_RET(__verify_dsk_memsize(session, addr, dsk, cell));
+ WT_RET(__verify_dsk_memsize(session, tag, dsk, cell));
return (0);
}
@@ -615,7 +618,7 @@ match_err: WT_RET_VRFY(session,
*/
static int
__verify_dsk_memsize(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
+ const char *tag, const WT_PAGE_HEADER *dsk, WT_CELL *cell)
{
size_t len;
@@ -630,7 +633,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session,
WT_RET_VRFY(session,
"%s page at %s has %" WT_SIZET_FMT " unexpected bytes of data "
"after the last cell",
- __wt_page_type_string(dsk->type), addr, len);
+ __wt_page_type_string(dsk->type), tag, len);
}
/*
@@ -639,7 +642,7 @@ __verify_dsk_memsize(WT_SESSION_IMPL *session,
*/
static int
__verify_dsk_chunk(WT_SESSION_IMPL *session,
- const char *addr, const WT_PAGE_HEADER *dsk, uint32_t datalen)
+ const char *tag, const WT_PAGE_HEADER *dsk, uint32_t datalen)
{
WT_BTREE *btree;
uint8_t *p, *end;
@@ -655,14 +658,14 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
if (p + datalen > end)
WT_RET_VRFY(session,
"data on page at %s extends past the end of the page",
- addr);
+ tag);
/* Any bytes after the data chunk should be nul bytes. */
for (p += datalen; p < end; ++p)
if (*p != '\0')
WT_RET_VRFY(session,
"%s page at %s has non-zero trailing bytes",
- __wt_page_type_string(dsk->type), addr);
+ __wt_page_type_string(dsk->type), tag);
return (0);
}
@@ -673,11 +676,11 @@ __verify_dsk_chunk(WT_SESSION_IMPL *session,
*/
static int
__err_cell_corrupted(
- WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+ WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
{
WT_RET_VRFY(session,
"item %" PRIu32 " on page at %s is a corrupted cell",
- entry_num, addr);
+ entry_num, tag);
}
/*
@@ -686,7 +689,7 @@ __err_cell_corrupted(
*/
static int
__err_cell_type(WT_SESSION_IMPL *session,
- uint32_t entry_num, const char *addr, uint8_t cell_type, uint8_t dsk_type)
+ uint32_t entry_num, const char *tag, uint8_t cell_type, uint8_t dsk_type)
{
switch (cell_type) {
case WT_CELL_ADDR_DEL:
@@ -735,7 +738,7 @@ __err_cell_type(WT_SESSION_IMPL *session,
WT_RET_VRFY(session,
"illegal cell and page type combination: cell %" PRIu32
" on page at %s is a %s cell on a %s page",
- entry_num, addr,
+ entry_num, tag,
__wt_cell_type_string(cell_type), __wt_page_type_string(dsk_type));
}
@@ -744,10 +747,10 @@ __err_cell_type(WT_SESSION_IMPL *session,
* Generic item references non-existent file pages error.
*/
static int
-__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *addr)
+__err_eof(WT_SESSION_IMPL *session, uint32_t entry_num, const char *tag)
{
WT_RET_VRFY(session,
"off-page item %" PRIu32
" on page at %s references non-existent file pages",
- entry_num, addr);
+ entry_num, tag);
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 2fe09681090..cbc5143698b 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -17,7 +17,7 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -25,7 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_INSERT_HEAD *ins_head, **ins_headp;
WT_ITEM _value;
WT_PAGE *page;
- WT_UPDATE *old_upd;
+ WT_UPDATE *old_upd, *upd;
size_t ins_size, upd_size;
u_int i, skipdepth;
int append, logged;
@@ -33,6 +33,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
btree = cbt->btree;
ins = NULL;
page = cbt->ref->page;
+ upd = upd_arg;
append = logged = 0;
/* This code expects a remove to have a NULL value. */
@@ -48,10 +49,10 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* There's some chance the application specified a record past
* the last record on the page. If that's the case, and we're
* inserting a new WT_INSERT/WT_UPDATE pair, it goes on the
- * append list, not the update list. In addition, a recno of 0
+ * append list, not the update list. Also, an out-of-band recno
* implies an append operation, we're allocating a new row.
*/
- if (recno == 0 ||
+ if (recno == WT_RECNO_OOB ||
recno > (btree->type == BTREE_COL_VAR ?
__col_var_last_recno(page) : __col_fix_last_recno(page)))
append = 1;
@@ -76,7 +77,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* If we are restoring updates that couldn't be evicted, the
* key must not exist on the new page.
*/
- WT_ASSERT(session, upd == NULL);
+ WT_ASSERT(session, upd_arg == NULL);
/* Make sure the update can proceed. */
WT_ERR(__wt_txn_update_check(
@@ -134,7 +135,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
cbt->ins_head = ins_head;
cbt->ins = ins;
- if (upd == NULL) {
+ if (upd_arg == NULL) {
WT_ERR(
__wt_update_alloc(session, value, &upd, &upd_size));
WT_ERR(__wt_txn_modify(session, upd));
@@ -160,7 +161,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* The serial mutex acts as our memory barrier to flush these
* writes before inserting them into the list.
*/
- if (WT_SKIP_FIRST(ins_head) == NULL || recno == 0)
+ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB)
for (i = 0; i < skipdepth; i++) {
cbt->ins_stack[i] = &ins_head->head[i];
ins->next[i] = cbt->next_stack[i] = NULL;
@@ -192,7 +193,8 @@ err: /*
if (logged)
__wt_txn_unmodify(session);
__wt_free(session, ins);
- __wt_free(session, upd);
+ if (upd_arg == NULL)
+ __wt_free(session, upd);
}
return (ret);
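The error path above frees upd only when upd_arg was NULL, i.e. only when
this function allocated it: resources passed in by the caller remain the
caller's to free on failure. A small standalone sketch of that ownership
rule, with a hypothetical worker rather than WiredTiger code:

#include <errno.h>
#include <stdlib.h>

/* Hypothetical worker; fails so the error path runs. */
static int
do_work(void *buf)
{
	(void)buf;
	return (EBUSY);
}

static int
modify(void *caller_upd)
{
	void *upd;
	int ret;

	/* Use the caller's update if given, else allocate our own. */
	upd = caller_upd != NULL ? caller_upd : malloc(64);
	if (upd == NULL)
		return (ENOMEM);

	if ((ret = do_work(upd)) != 0 && caller_upd == NULL)
		free(upd);	/* Only free what we allocated. */
	return (ret);
}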
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index f2868afe13a..4affa7fa62a 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -448,7 +448,8 @@ next: switch (direction) {
* update the page's memory footprint, on failure, free
* the allocated memory.
*/
- if (WT_ATOMIC_CAS8(WT_ROW_KEY_COPY(rip), copy, ikey))
+ if (__wt_atomic_cas_ptr(
+ (void *)&WT_ROW_KEY_COPY(rip), copy, ikey))
__wt_cache_page_inmem_incr(session,
page, sizeof(WT_IKEY) + ikey->size);
else
@@ -525,7 +526,7 @@ __wt_row_ikey(WT_SESSION_IMPL *session,
WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0);
WT_ASSERT(session, ref->state != WT_REF_SPLIT);
WT_ASSERT(session,
- WT_ATOMIC_CAS8(ref->key.ikey, (WT_IKEY *)oldv, ikey));
+ __wt_atomic_cas_ptr(&ref->key.ikey, (WT_IKEY *)oldv, ikey));
}
#else
ref->key.ikey = ikey;
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 62177b7e4c7..888c54d1ec9 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -26,7 +26,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
* Select a spinlock for the page; let the barrier immediately below
* keep things from racing too badly.
*/
- modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+ modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;
/*
* Multiple threads of control may be searching and deciding to modify
@@ -34,7 +34,7 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
* footprint, else discard the modify structure, another thread did the
* work.
*/
- if (WT_ATOMIC_CAS8(page->modify, NULL, modify))
+ if (__wt_atomic_cas_ptr(&page->modify, NULL, modify))
__wt_cache_page_inmem_incr(session, page, sizeof(*modify));
else
__wt_free(session, modify);
@@ -112,6 +112,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* there should only be one update list per key.
*/
WT_ASSERT(session, *upd_entry == NULL);
+
/*
* Set the "old" entry to the second update in the list
* so that the serialization function succeeds in
@@ -192,7 +193,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* The serial mutex acts as our memory barrier to flush these
* writes before inserting them into the list.
*/
- if (WT_SKIP_FIRST(ins_head) == NULL)
+ if (cbt->ins_stack[0] == NULL)
for (i = 0; i < skipdepth; i++) {
cbt->ins_stack[i] = &ins_head->head[i];
ins->next[i] = cbt->next_stack[i] = NULL;
@@ -316,7 +317,7 @@ __wt_update_obsolete_check(
*/
if (first != NULL &&
(next = first->next) != NULL &&
- WT_ATOMIC_CAS8(first->next, next, NULL))
+ __wt_atomic_cas_ptr(&first->next, next, NULL))
return (next);
/*
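
Two details in this file deserve a note. WT_PAGE_LOCKS becomes a compile-time constant rather than a per-connection value, and the page-lock assignment remains classic lock striping: a shared counter spreads pages across a fixed array of locks so concurrent modifications rarely contend. A minimal sketch (stripe count and names illustrative):

#include <stdatomic.h>

#define PAGE_LOCKS	17	/* Fixed number of lock stripes. */

static atomic_uint page_lock_cnt;

/*
 * Sketch: hand out stripe indexes round-robin; a caller would lock
 * page_lock[assign_page_lock()] for the life of the page.
 */
static unsigned int
assign_page_lock(void)
{
	return (atomic_fetch_add(&page_lock_cnt, 1) % PAGE_LOCKS);
}
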
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 9803b924355..d83d3253c44 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -471,6 +471,7 @@ __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *current, *descent;
+ uint32_t cnt;
btree = S2BT(session);
@@ -528,18 +529,22 @@ restart:
/*
* If the tree is new (and not empty), it might have a large insert
- * list, pick the key in the middle of that insert list.
+ * list. Count how many records are in the list.
*/
F_SET(cbt, WT_CBT_SEARCH_SMALLEST);
if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL)
WT_ERR(WT_NOTFOUND);
- for (p = t = WT_SKIP_FIRST(cbt->ins_head);;) {
+ for (cnt = 1, p = WT_SKIP_FIRST(cbt->ins_head);; ++cnt)
if ((p = WT_SKIP_NEXT(p)) == NULL)
break;
- if ((p = WT_SKIP_NEXT(p)) == NULL)
+
+ /*
+	 * Select a random number from 0 to (N - 1) and return that record.
+ */
+ cnt = __wt_random(&session->rnd) % cnt;
+ for (p = t = WT_SKIP_FIRST(cbt->ins_head);; t = p)
+ if (cnt-- == 0 || (p = WT_SKIP_NEXT(p)) == NULL)
break;
- t = WT_SKIP_NEXT(t);
- }
cbt->ref = current;
cbt->compare = 0;
cbt->ins = t;
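
The rewritten insert-list case replaces the old two-pointer "find the middle" walk with a uniform choice: one pass counts the N records on the skiplist's lowest level, a random index in [0, N) is drawn, and a second pass stops at that element. A sketch of the same two-pass selection over a plain linked list (names illustrative):

#include <stdint.h>
#include <stdlib.h>

struct node {
	struct node *next;
};

/*
 * Sketch: choose a uniformly random element of a non-empty singly
 * linked list by counting, drawing an index, then walking to it.
 */
static struct node *
random_element(struct node *head, unsigned int *seedp)
{
	struct node *p;
	uint32_t cnt, r;

	for (cnt = 1, p = head; p->next != NULL; p = p->next)
		++cnt;				/* First pass: count N. */

	r = (uint32_t)rand_r(seedp) % cnt;	/* Index in [0, N). */
	for (p = head; r > 0; --r)
		p = p->next;			/* Second pass: walk r steps. */
	return (p);
}
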
diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c
new file mode 100644
index 00000000000..e269e8702e1
--- /dev/null
+++ b/src/cache/cache_las.c
@@ -0,0 +1,391 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_las_stats_update --
+ * Update the lookaside table statistics for return to the application.
+ */
+void
+__wt_las_stats_update(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **cstats;
+ WT_DSRC_STATS **dstats;
+
+ conn = S2C(session);
+
+ /*
+ * Lookaside table statistics are copied from the underlying lookaside
+ * table data-source statistics. If there's no lookaside table, values
+ * remain 0. In the current system, there's always a lookaside table,
+ * but there's no reason not to be cautious.
+ */
+ if (conn->las_cursor == NULL)
+ return;
+
+ /*
+ * We have a cursor, and we need the underlying data handle; we can get
+ * to it by way of the underlying btree handle, but it's a little ugly.
+ */
+ cstats = conn->stats;
+ dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats;
+
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert));
+ WT_STAT_SET(session, cstats,
+ cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove));
+}
+
+/*
+ * __las_cursor_create --
+ * Open a new lookaside table cursor.
+ */
+static int
+__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp)
+{
+ WT_BTREE *btree;
+ const char *open_cursor_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
+
+ WT_RET(__wt_open_cursor(
+ session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp));
+
+ /*
+ * Set special flags for the lookaside table: the lookaside flag (used,
+	 * for example, to avoid writing records during reconciliation); also
+ * turn off checkpoints and logging.
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ F_SET(btree, WT_BTREE_LOOKASIDE);
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(btree, WT_BTREE_NO_CHECKPOINT);
+ if (!F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_SET(btree, WT_BTREE_NO_LOGGING);
+
+ return (0);
+}
+
+/*
+ * __wt_las_create --
+ * Initialize the database's lookaside store.
+ */
+int
+__wt_las_create(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ const char *drop_cfg[] = {
+ WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL };
+
+ conn = S2C(session);
+
+ /*
+ * Done at startup: we cannot do it on demand because we require the
+ * schema lock to create and drop the file, and it may not always be
+ * available.
+ *
+ * Open an internal session, used for the shared lookaside cursor.
+ *
+ * Sessions associated with a lookaside cursor should never be tapped
+ * for eviction.
+ */
+ WT_RET(__wt_open_internal_session(
+ conn, "lookaside table", 1, 1, &conn->las_session));
+ session = conn->las_session;
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
+
+ /* Discard any previous incarnation of the file. */
+ WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg));
+
+ /* Re-create the file. */
+ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));
+
+ /* Open the shared cursor. */
+ WT_WITHOUT_DHANDLE(session,
+ ret = __las_cursor_create(session, &conn->las_cursor));
+
+ return (ret);
+}
+
+/*
+ * __wt_las_destroy --
+ * Destroy the database's lookaside store.
+ */
+int
+__wt_las_destroy(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION *wt_session;
+
+ conn = S2C(session);
+
+ if (conn->las_session == NULL)
+ return (0);
+
+ wt_session = &conn->las_session->iface;
+ ret = wt_session->close(wt_session, NULL);
+
+ conn->las_cursor = NULL;
+ conn->las_session = NULL;
+
+ return (ret);
+}
+
+/*
+ * __wt_las_set_written --
+ * Flag that the lookaside table has been written.
+ */
+void
+__wt_las_set_written(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+ if (!conn->las_written) {
+ conn->las_written = true;
+
+ /*
+		 * Push the flag: unnecessary, but from now on page reads must
+		 * deal with lookaside table records, and we only do the write
+		 * once.
+ * with lookaside table records, and we only do the write once.
+ */
+ WT_FULL_BARRIER();
+ }
+}
+
+/*
+ * __wt_las_is_written --
+ * Return if the lookaside table has been written.
+ */
+bool
+__wt_las_is_written(WT_SESSION_IMPL *session)
+{
+ return (S2C(session)->las_written);
+}
+
+/*
+ * __wt_las_cursor --
+ * Return a lookaside cursor.
+ */
+int
+__wt_las_cursor(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ *cursorp = NULL;
+
+ /*
+ * We don't want to get tapped for eviction after we start using the
+ * lookaside cursor; save a copy of the current eviction state, we'll
+ * turn eviction off before we return.
+ *
+ * Don't cache lookaside table pages, we're here because of eviction
+ * problems and there's no reason to believe lookaside pages will be
+ * useful more than once.
+ */
+ *session_flags =
+ F_ISSET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ conn = S2C(session);
+
+ /* Eviction and sweep threads have their own lookaside table cursors. */
+ if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) {
+ if (session->las_cursor == NULL) {
+ WT_WITHOUT_DHANDLE(session, ret =
+ __las_cursor_create(session, &session->las_cursor));
+ WT_RET(ret);
+ }
+
+ *cursorp = session->las_cursor;
+ } else {
+ /* Lock the shared lookaside cursor. */
+ __wt_spin_lock(session, &conn->las_lock);
+
+ *cursorp = conn->las_cursor;
+ }
+
+ /* Turn caching and eviction off. */
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+
+ return (0);
+}
+
+/*
+ * __wt_las_cursor_close --
+ * Discard a lookaside cursor.
+ */
+int
+__wt_las_cursor_close(
+ WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ if ((cursor = *cursorp) == NULL)
+ return (0);
+ *cursorp = NULL;
+
+ /* Reset the cursor. */
+ ret = cursor->reset(cursor);
+
+ /*
+ * We turned off caching and eviction while the lookaside cursor was in
+ * use, restore the session's flags.
+ */
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
+ F_SET(session, session_flags);
+
+ /*
+ * Eviction and sweep threads have their own lookaside table cursors;
+ * else, unlock the shared lookaside cursor.
+ */
+ if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
+ __wt_spin_unlock(session, &conn->las_lock);
+
+ return (ret);
+}
+
+/*
+ * __wt_las_sweep --
+ * Sweep the lookaside table.
+ */
+int
+__wt_las_sweep(WT_SESSION_IMPL *session)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(las_addr);
+ WT_DECL_ITEM(las_key);
+ WT_DECL_RET;
+ WT_ITEM *key;
+ uint64_t cnt, las_counter, las_txnid;
+ uint32_t las_id, session_flags;
+ int notused;
+
+ conn = S2C(session);
+ cursor = NULL;
+ key = &conn->las_sweep_key;
+ session_flags = 0; /* [-Werror=maybe-uninitialized] */
+
+ WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
+ WT_ERR(__wt_scr_alloc(session, 0, &las_key));
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /*
+ * If we're not starting a new sweep, position the cursor using the key
+	 * from the last call (we don't care whether we land before or after
+	 * the key; roughly the same spot is fine).
+ */
+ if (conn->las_sweep_call != 0 && key->data != NULL) {
+ __wt_cursor_set_raw_key(cursor, key);
+ if ((ret = cursor->search_near(cursor, &notused)) != 0)
+ goto srch_notfound;
+ }
+
+ /*
+ * The sweep server wakes up every 10 seconds (by default), it's a slow
+ * moving thread. Try to review the entire lookaside table once every 5
+ * minutes, or every 30 calls.
+ *
+	 * The reasoning: the lookaside table exists because we're seeing
+ * cache/eviction pressure (it allows us to trade performance and disk
+ * space for cache space), and it's likely lookaside blocks are being
+ * evicted, and reading them back in doesn't help things. A trickier,
+ * but possibly better, alternative might be to review all lookaside
+ * blocks in the cache in order to get rid of them, and slowly review
+ * lookaside blocks that have already been evicted.
+ *
+ * We can't know for sure how many records are in the lookaside table,
+ * the cursor insert and remove statistics aren't updated atomically.
+ * Start with reviewing 100 rows, and if it takes more than the target
+ * number of calls to finish, increase the number of rows checked on
+ * each call; if it takes less than the target calls to finish, then
+ * decrease the number of rows reviewed on each call (but never less
+ * than 100).
+ */
+#define WT_SWEEP_LOOKASIDE_MIN_CNT 100
+#define WT_SWEEP_LOOKASIDE_PASS_TARGET 30
+ ++conn->las_sweep_call;
+ if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
+ cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;
+
+ /* Walk the file. */
+ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
+ /*
+ * If the loop terminates after completing a work unit, we will
+		 * continue the table sweep next time. Get a local copy of the
+		 * sweep key since we're going to reset the cursor; do so before
+		 * calling cursor.remove, because cursor.remove can discard our
+		 * hazard pointer and the page could be evicted from underneath
+		 * us.
+ */
+ if (cnt == 1) {
+ WT_ERR(__wt_cursor_get_raw_key(cursor, key));
+ if (!WT_DATA_IN_ITEM(key))
+ WT_ERR(__wt_buf_set(
+ session, key, key->data, key->size));
+ }
+
+ WT_ERR(cursor->get_key(cursor,
+ &las_id, las_addr, &las_counter, &las_txnid, las_key));
+
+ /*
+ * If the on-page record transaction ID associated with the
+ * record is globally visible, the record can be discarded.
+ *
+ * Cursor opened overwrite=true: won't return WT_NOTFOUND should
+ * another thread remove the record before we do, and the cursor
+ * remains positioned in that case.
+ */
+ if (__wt_txn_visible_all(session, las_txnid))
+ WT_ERR(cursor->remove(cursor));
+ }
+
+ /*
+ * When reaching the lookaside table end or the target number of calls,
+	 * adjust the row count. Decrease/increase the row count depending on
+	 * whether the number of calls is less/more than the target.
+ */
+ if (ret == WT_NOTFOUND ||
+ conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
+ if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
+ conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
+ conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
+ if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
+ conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
+ }
+
+srch_notfound:
+ if (ret == WT_NOTFOUND)
+ conn->las_sweep_call = 0;
+
+ WT_ERR_NOTFOUND_OK(ret);
+
+ if (0) {
+err: __wt_buf_free(session, key);
+ }
+
+ WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &las_addr);
+ __wt_scr_free(session, &las_key);
+
+ return (ret);
+}
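
The sweep pacing above is a simple feedback loop: each call reviews a budget of rows, and the budget grows or shrinks so that a full pass over the lookaside table takes roughly WT_SWEEP_LOOKASIDE_PASS_TARGET calls. A standalone sketch of the adjustment rule (constants as in the diff, function name illustrative):

#include <stdint.h>

#define SWEEP_MIN_CNT		100	/* Rows per call, lower bound. */
#define SWEEP_PASS_TARGET	30	/* Target calls per full pass. */

/*
 * Sketch: run when a pass completes (or overruns the target); adjust
 * how many rows each subsequent call reviews.
 */
static void
sweep_adjust(uint64_t calls, uint64_t *cntp)
{
	if (calls < SWEEP_PASS_TARGET && *cntp > SWEEP_MIN_CNT)
		*cntp -= SWEEP_MIN_CNT;	/* Finished early: do less per call. */
	if (calls > SWEEP_PASS_TARGET)
		*cntp += SWEEP_MIN_CNT;	/* Finished late: do more per call. */
}
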
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 73837c46ee8..91cfcedfcaf 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -76,6 +76,7 @@ static const WT_CONFIG_CHECK
confchk_wiredtiger_open_shared_cache_subconfigs[] = {
{ "chunk", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ "name", "string", NULL, NULL, NULL, 0 },
+ { "quota", "int", NULL, NULL, NULL, 0 },
{ "reserve", "int", NULL, NULL, NULL, 0 },
{ "size", "int", NULL, "min=1MB,max=10TB", NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
@@ -121,7 +122,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = {
{ "lsm_merge", "boolean", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -520,7 +521,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -595,7 +596,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -668,7 +669,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -740,7 +741,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = {
{ "session_scratch_max", "int", NULL, NULL, NULL, 0 },
{ "shared_cache", "category",
NULL, NULL,
- confchk_wiredtiger_open_shared_cache_subconfigs, 4 },
+ confchk_wiredtiger_open_shared_cache_subconfigs, 5 },
{ "statistics", "list",
NULL, "choices=[\"all\",\"fast\",\"none\",\"clear\"]",
NULL, 0 },
@@ -807,8 +808,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95"
",file_manager=(close_handle_minimum=250,close_idle_time=30,"
"close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)"
- ",lsm_merge=,shared_cache=(chunk=10MB,name=,reserve=0,size=500MB)"
- ",statistics=none,statistics_log=(on_close=0,"
+ ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0,"
+ "size=500MB),statistics=none,statistics_log=(on_close=0,"
"path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
confchk_WT_CONNECTION_reconfigure, 17
@@ -959,9 +960,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=",
confchk_wiredtiger_open, 34
@@ -979,9 +980,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),use_environment_priv=0,verbose=,version=(major=0,"
"minor=0)",
@@ -999,9 +1000,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=,version=(major=0,minor=0)",
confchk_wiredtiger_open_basecfg, 31
@@ -1018,9 +1019,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"log=(archive=,compressor=,enabled=0,file_max=100MB,path=,"
"prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4),"
"lsm_merge=,mmap=,multiprocess=0,session_max=100,"
- "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,reserve=0"
- ",size=500MB),statistics=none,statistics_log=(on_close=0,"
- "path=\"WiredTigerStat.%d.%H\",sources=,"
+ "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0,"
+ "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0"
+ ",path=\"WiredTigerStat.%d.%H\",sources=,"
"timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0"
",method=fsync),verbose=",
confchk_wiredtiger_open_usercfg, 30
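
The generated tables above add the new shared_cache.quota key and bump the subconfiguration counts from 4 to 5. Assuming the documented wiredtiger_open configuration syntax, a caller would set a per-connection quota like this (a sketch; the home directory and sizes are placeholders):

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;

	/* Cap this connection at 100MB of a 500MB shared pool. */
	if (wiredtiger_open("WT_HOME", NULL,
	    "create,shared_cache=(name=pool,size=500MB,chunk=10MB,"
	    "quota=100MB)", &conn) != 0)
		return (EXIT_FAILURE);

	return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
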
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 067ad00560e..b1155d06826 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -432,7 +432,7 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
"requires connection encryption to be set");
hash = __wt_hash_city64(keyid->str, keyid->len);
bucket = hash % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(kenc, &nenc->keyedhashlh[bucket], l)
+ TAILQ_FOREACH(kenc, &nenc->keyedhashqh[bucket], q)
if (WT_STRING_MATCH(kenc->keyid, keyid->str, keyid->len))
goto out;
@@ -450,8 +450,8 @@ __wt_encryptor_config(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *cval,
WT_ERR(encryptor->sizing(encryptor, &session->iface,
&kenc->size_const));
kenc->encryptor = encryptor;
- SLIST_INSERT_HEAD(&nenc->keyedlh, kenc, l);
- SLIST_INSERT_HEAD(&nenc->keyedhashlh[bucket], kenc, hashl);
+ TAILQ_INSERT_HEAD(&nenc->keyedqh, kenc, q);
+ TAILQ_INSERT_HEAD(&nenc->keyedhashqh[bucket], kenc, hashq);
out: __wt_spin_unlock(session, &conn->encryptor_lock);
*kencryptorp = kenc;
@@ -506,9 +506,9 @@ __conn_add_encryptor(WT_CONNECTION *wt_conn,
WT_ERR(__wt_calloc_one(session, &nenc));
WT_ERR(__wt_strdup(session, name, &nenc->name));
nenc->encryptor = encryptor;
- SLIST_INIT(&nenc->keyedlh);
+ TAILQ_INIT(&nenc->keyedqh);
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
- SLIST_INIT(&nenc->keyedhashlh[i]);
+ TAILQ_INIT(&nenc->keyedhashqh[i]);
TAILQ_INSERT_TAIL(&conn->encryptqh, nenc, q);
nenc = NULL;
@@ -537,15 +537,14 @@ __wt_conn_remove_encryptor(WT_SESSION_IMPL *session)
conn = S2C(session);
while ((nenc = TAILQ_FIRST(&conn->encryptqh)) != NULL) {
- while ((kenc = SLIST_FIRST(&nenc->keyedlh)) != NULL) {
+ while ((kenc = TAILQ_FIRST(&nenc->keyedqh)) != NULL) {
/* Call any termination method. */
if (kenc->owned && kenc->encryptor->terminate != NULL)
WT_TRET(kenc->encryptor->terminate(
kenc->encryptor, (WT_SESSION *)session));
/* Remove from the connection's list, free memory. */
- SLIST_REMOVE(
- &nenc->keyedlh, kenc, __wt_keyed_encryptor, l);
+ TAILQ_REMOVE(&nenc->keyedqh, kenc, q);
__wt_free(session, kenc->keyid);
__wt_free(session, kenc);
}
@@ -1725,7 +1724,8 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
"encryption=(secretkey=),"
"exclusive=,"
"log=(recover=),"
- "use_environment_priv=,", &base_config));
+ "use_environment_priv=,"
+ "verbose=,", &base_config));
WT_ERR(__wt_config_init(session, &parser, base_config));
while ((ret = __wt_config_next(&parser, &k, &v)) == 0) {
/* Fix quoting for non-trivial settings. */
@@ -1795,6 +1795,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_DECL_RET;
const WT_NAME_FLAG *ft;
WT_SESSION_IMPL *session;
+ int64_t config_base_set;
const char *enc_cfg[] = { NULL, NULL };
char version[64];
@@ -1836,6 +1837,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
cfg[0] = WT_CONFIG_BASE(session, wiredtiger_open);
cfg[1] = config;
+	/* Capture the config_base setting for later use. */
+ WT_ERR(__wt_config_gets(session, cfg, "config_base", &cval));
+ config_base_set = cval.val;
+
/* Configure error messages so we get them right early. */
WT_ERR(__wt_config_gets(session, cfg, "error_prefix", &cval));
if (cval.len != 0)
@@ -1873,7 +1878,10 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WIREDTIGER_VERSION_MAJOR, WIREDTIGER_VERSION_MINOR) >=
(int)sizeof(version), ENOMEM);
__conn_config_append(cfg, version);
- WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
+
+	/* Ignore the base configuration file if config_base is set to false. */
+ if (config_base_set != 0)
+ WT_ERR(__conn_config_file(session, WT_BASECONFIG, 0, cfg, i1));
__conn_config_append(cfg, config);
WT_ERR(__conn_config_file(session, WT_USERCONFIG, 1, cfg, i2));
WT_ERR(__conn_config_env(session, cfg, i3));
@@ -1904,7 +1912,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
conn->hazard_max = (uint32_t)cval.val;
WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
- conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;
+ conn->session_size = (uint32_t)cval.val + WT_EXTRA_INTERNAL_SESSIONS;
WT_ERR(__wt_config_gets(session, cfg, "session_scratch_max", &cval));
conn->session_scratch_max = (size_t)cval.val;
@@ -2023,11 +2031,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
WT_ERR(__wt_turtle_init(session));
WT_ERR(__wt_metadata_open(session));
- /*
- * Start the worker threads last.
- */
+ /* Start the worker threads and run recovery. */
WT_ERR(__wt_connection_workers(session, cfg));
+ /* Create the lookaside table. */
+ WT_ERR(__wt_las_create(session));
+
WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
*wt_connp = &conn->iface;
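
With the wiredtiger_open changes above, the config_base value is read before any configuration files are parsed, so the WiredTiger.basecfg file can be skipped entirely. Assuming the public API, usage is simply (a sketch; the home directory is a placeholder):

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;

	/* Open without reading the base configuration file. */
	if (wiredtiger_open("WT_HOME", NULL,
	    "create,config_base=false", &conn) != 0)
		return (EXIT_FAILURE);

	return (conn->close(conn, NULL) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
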
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index d62425fe536..8f62c7140c7 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -156,7 +156,8 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
/* Allocate the LRU eviction queue. */
cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
- WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));
+ WT_ERR(__wt_calloc_def(session,
+ cache->evict_slots, &cache->evict_queue));
/*
* We get/set some values in the cache statistics (rather than have
@@ -178,12 +179,12 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
uint64_t inuse, leaf, used;
conn = S2C(session);
cache = conn->cache;
- stats = &conn->stats;
+ stats = conn->stats;
inuse = __wt_cache_bytes_inuse(cache);
/*
@@ -193,19 +194,23 @@ __wt_cache_stats_update(WT_SESSION_IMPL *session)
used = cache->bytes_overflow + cache->bytes_internal;
leaf = inuse > used ? inuse - used : 0;
- WT_STAT_SET(stats, cache_bytes_max, conn->cache_size);
- WT_STAT_SET(stats, cache_bytes_inuse, inuse);
+ WT_STAT_SET(session, stats, cache_bytes_max, conn->cache_size);
+ WT_STAT_SET(session, stats, cache_bytes_inuse, inuse);
- WT_STAT_SET(stats, cache_overhead, cache->overhead_pct);
- WT_STAT_SET(stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
- WT_STAT_SET(stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
- WT_STAT_SET(stats,
+ WT_STAT_SET(session, stats, cache_overhead, cache->overhead_pct);
+ WT_STAT_SET(
+ session, stats, cache_pages_inuse, __wt_cache_pages_inuse(cache));
+ WT_STAT_SET(
+ session, stats, cache_bytes_dirty, __wt_cache_dirty_inuse(cache));
+ WT_STAT_SET(session, stats,
cache_eviction_maximum_page_size, cache->evict_max_page_size);
- WT_STAT_SET(stats, cache_pages_dirty, cache->pages_dirty);
+ WT_STAT_SET(session, stats, cache_pages_dirty, cache->pages_dirty);
- WT_STAT_SET(stats, cache_bytes_internal, cache->bytes_internal);
- WT_STAT_SET(stats, cache_bytes_overflow, cache->bytes_overflow);
- WT_STAT_SET(stats, cache_bytes_leaf, leaf);
+ WT_STAT_SET(
+ session, stats, cache_bytes_internal, cache->bytes_internal);
+ WT_STAT_SET(
+ session, stats, cache_bytes_overflow, cache->bytes_overflow);
+ WT_STAT_SET(session, stats, cache_bytes_leaf, leaf);
}
/*
@@ -246,7 +251,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
__wt_spin_destroy(session, &cache->evict_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
- __wt_free(session, cache->evict);
+ __wt_free(session, cache->evict_queue);
__wt_free(session, conn->cache);
return (ret);
}
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index fdc95a32387..aaae58ef168 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -22,21 +22,22 @@
*/
#define WT_CACHE_POOL_REDUCE_THRESHOLD 20
/* Balancing passes after a bump before a connection is a candidate. */
-#define WT_CACHE_POOL_BUMP_SKIPS 10
+#define WT_CACHE_POOL_BUMP_SKIPS 5
/* Balancing passes after a reduction before a connection is a candidate. */
-#define WT_CACHE_POOL_REDUCE_SKIPS 5
+#define WT_CACHE_POOL_REDUCE_SKIPS 10
/*
* Constants that control how much influence different metrics have on
* the pressure calculation.
*/
-#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 10
-#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 50
+#define WT_CACHE_POOL_APP_EVICT_MULTIPLIER 3
+#define WT_CACHE_POOL_APP_WAIT_MULTIPLIER 6
#define WT_CACHE_POOL_READ_MULTIPLIER 1
-static int __cache_pool_adjust(WT_SESSION_IMPL *, uint64_t, uint64_t, int *);
+static int __cache_pool_adjust(
+ WT_SESSION_IMPL *, uint64_t, uint64_t, int, int *);
static int __cache_pool_assess(WT_SESSION_IMPL *, uint64_t *);
-static int __cache_pool_balance(WT_SESSION_IMPL *);
+static int __cache_pool_balance(WT_SESSION_IMPL *, int);
/*
* __wt_cache_pool_config --
@@ -51,7 +52,7 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
WT_DECL_RET;
char *pool_name;
int created, updating;
- uint64_t chunk, reserve, size, used_cache;
+ uint64_t chunk, quota, reserve, size, used_cache;
conn = S2C(session);
created = updating = 0;
@@ -142,6 +143,11 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
chunk = (uint64_t)cval.val;
else
chunk = cp->chunk;
+ if (__wt_config_gets(session, &cfg[1],
+ "shared_cache.quota", &cval) == 0 && cval.val != 0)
+ quota = (uint64_t)cval.val;
+ else
+ quota = cp->quota;
} else {
/*
* The only time shared cache configuration uses default
@@ -155,6 +161,9 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
session, cfg, "shared_cache.chunk", &cval));
WT_ASSERT(session, cval.val != 0);
chunk = (uint64_t)cval.val;
+ WT_ERR(__wt_config_gets(
+ session, cfg, "shared_cache.quota", &cval));
+ quota = (uint64_t)cval.val;
}
/*
@@ -197,8 +206,10 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
/* The configuration is verified - it's safe to update the pool. */
cp->size = size;
cp->chunk = chunk;
+ cp->quota = quota;
conn->cache->cp_reserved = reserve;
+ conn->cache->cp_quota = quota;
/* Wake up the cache pool server so any changes are noticed. */
if (updating)
@@ -402,7 +413,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session)
* effectively used.
*/
static int
-__cache_pool_balance(WT_SESSION_IMPL *session)
+__cache_pool_balance(WT_SESSION_IMPL *session, int forward)
{
WT_CACHE_POOL *cp;
WT_DECL_RET;
@@ -421,16 +432,16 @@ __cache_pool_balance(WT_SESSION_IMPL *session)
WT_ERR(__cache_pool_assess(session, &highest));
bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
+
/*
* Actively attempt to:
* - Reduce the amount allocated, if we are over the budget
* - Increase the amount used if there is capacity and any pressure.
*/
- for (bump_threshold = WT_CACHE_POOL_BUMP_THRESHOLD;
- F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
- F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN);) {
+ while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
+ F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) {
WT_ERR(__cache_pool_adjust(
- session, highest, bump_threshold, &adjusted));
+ session, highest, bump_threshold, forward, &adjusted));
/*
* Stop if the amount of cache being used is stable, and we
* aren't over capacity.
@@ -456,30 +467,39 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
uint64_t app_evicts, app_waits, reads;
- uint64_t entries, highest, tmp;
+ uint64_t balanced_size, entries, highest, tmp;
cp = __wt_process.cache_pool;
- entries = 0;
+ balanced_size = entries = 0;
highest = 1; /* Avoid divide by zero */
+ TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ if (entry->cache_size == 0 || entry->cache == NULL)
+ continue;
+ ++entries;
+ }
+
+ if (entries > 0)
+ balanced_size = cp->currently_used / entries;
+
/* Generate read pressure information. */
TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
- if (entry->cache_size == 0 ||
- entry->cache == NULL)
+ if (entry->cache_size == 0 || entry->cache == NULL)
continue;
cache = entry->cache;
- ++entries;
/*
* Figure out a delta since the last time we did an assessment
* for each metric we are tracking. Watch out for wrapping
* of values.
+ *
+ * Count pages read, assuming pages are 4KB.
*/
- tmp = cache->bytes_read;
+ tmp = cache->bytes_read >> 12;
if (tmp >= cache->cp_saved_read)
reads = tmp - cache->cp_saved_read;
else
- reads = (UINT64_MAX - cache->cp_saved_read) + tmp;
+ reads = tmp;
cache->cp_saved_read = tmp;
/* Update the application eviction count information */
@@ -500,12 +520,19 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
(UINT64_MAX - cache->cp_saved_app_waits) + tmp;
cache->cp_saved_app_waits = tmp;
- /* Calculate the weighted pressure for this member */
- cache->cp_pass_pressure =
- (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
+ /* Calculate the weighted pressure for this member. */
+ tmp = (app_evicts * WT_CACHE_POOL_APP_EVICT_MULTIPLIER) +
(app_waits * WT_CACHE_POOL_APP_WAIT_MULTIPLIER) +
(reads * WT_CACHE_POOL_READ_MULTIPLIER);
+ /* Weight smaller caches higher. */
+ tmp = (uint64_t)(tmp *
+ ((double)balanced_size / entry->cache_size));
+
+ /* Smooth over history. */
+ cache->cp_pass_pressure =
+ (9 * cache->cp_pass_pressure + tmp) / 10;
+
if (cache->cp_pass_pressure > highest)
highest = cache->cp_pass_pressure;
@@ -524,24 +551,25 @@ __cache_pool_assess(WT_SESSION_IMPL *session, uint64_t *phighest)
/*
* __cache_pool_adjust --
- * Adjust the allocation of cache to each connection. If force is set
+ *	Adjust the allocation of cache to each connection. If full is set,
 *	ignore cache load information and reduce the allocation for every
 *	connection allocated more than its reserved size.
*/
static int
__cache_pool_adjust(WT_SESSION_IMPL *session,
- uint64_t highest, uint64_t bump_threshold, int *adjustedp)
+ uint64_t highest, uint64_t bump_threshold, int forward, int *adjustedp)
{
WT_CACHE_POOL *cp;
WT_CACHE *cache;
WT_CONNECTION_IMPL *entry;
- uint64_t adjusted, highest_percentile, pressure, reserved;
- int force, grew;
+ uint64_t adjustment, highest_percentile, pressure, reserved, smallest;
+ int busy, pool_full, grow;
+ u_int pct_full;
*adjustedp = 0;
cp = __wt_process.cache_pool;
- force = (cp->currently_used > cp->size);
- grew = 0;
+ grow = 0;
+ pool_full = (cp->currently_used >= cp->size);
/* Highest as a percentage, avoid 0 */
highest_percentile = (highest / 100) + 1;
@@ -549,13 +577,17 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
WT_RET(__wt_verbose(session,
WT_VERB_SHARED_CACHE, "Cache pool distribution: "));
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t" "cache_size, pressure, skips: "));
+ "\t" "cache (MB), pressure, skips, busy, %% full:"));
}
- TAILQ_FOREACH(entry, &cp->cache_pool_qh, cpq) {
+ for (entry = forward ? TAILQ_FIRST(&cp->cache_pool_qh) :
+ TAILQ_LAST(&cp->cache_pool_qh, __wt_cache_pool_qh);
+ entry != NULL;
+ entry = forward ? TAILQ_NEXT(entry, cpq) :
+ TAILQ_PREV(entry, __wt_cache_pool_qh, cpq)) {
cache = entry->cache;
reserved = cache->cp_reserved;
- adjusted = 0;
+ adjustment = 0;
/*
* The read pressure is calculated as a percentage of how
@@ -565,84 +597,109 @@ __cache_pool_adjust(WT_SESSION_IMPL *session,
* assigned.
*/
pressure = cache->cp_pass_pressure / highest_percentile;
+ busy = __wt_eviction_needed(entry->default_session, &pct_full);
+
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
- "\t%" PRIu64 ", %" PRIu64 ", %" PRIu32,
- entry->cache_size, pressure, cache->cp_skip_count));
+ "\t%5" PRIu64 ", %3" PRIu64 ", %2" PRIu32 ", %d, %2u",
+ entry->cache_size >> 20, pressure, cache->cp_skip_count,
+ busy, pct_full));
/* Allow to stabilize after changes. */
if (cache->cp_skip_count > 0 && --cache->cp_skip_count > 0)
continue;
+
/*
* If the entry is currently allocated less than the reserved
- * size, increase it's allocation. This should only happen if:
- * - It's the first time we've seen this member
- * - The reserved size has been adjusted
+ * size, increase its allocation. This should only happen if:
+ * - it's the first time we've seen this member, or
+ * - the reserved size has been adjusted
*/
if (entry->cache_size < reserved) {
- grew = 1;
- adjusted = reserved - entry->cache_size;
-
+ grow = 1;
+ adjustment = reserved - entry->cache_size;
/*
* Conditions for reducing the amount of resources for an
* entry:
- * - If we are forcing and this entry has more than the
- * minimum amount of space in use.
- * - If the read pressure in this entry is below the
- * threshold, other entries need more cache, the entry has
- * more than the minimum space and there is no available
- * space in the pool.
+ * - the pool is full,
+ * - application threads are not busy doing eviction already,
+ * - this entry has more than the minimum amount of space in
+ * use,
+ * - the read pressure in this entry is below the threshold,
+ * other entries need more cache, the entry has more than
+ * the minimum space and there is no available space in the
+ * pool.
*/
- } else if ((force && entry->cache_size > reserved) ||
- (pressure < WT_CACHE_POOL_REDUCE_THRESHOLD &&
- highest > 1 && entry->cache_size > reserved &&
- cp->currently_used >= cp->size)) {
- grew = 0;
+ } else if (pool_full && !busy &&
+ entry->cache_size > reserved &&
+ pressure < WT_CACHE_POOL_REDUCE_THRESHOLD && highest > 1) {
+ grow = 0;
/*
- * Shrink by a chunk size if that doesn't drop us
- * below the reserved size.
+			 * Don't drop the size down too much, or it can
+ * trigger aggressive eviction in the connection,
+ * which is likely to lead to lower throughput and
+ * potentially a negative feedback loop in the
+ * balance algorithm.
*/
- if (entry->cache_size > cp->chunk + reserved)
- adjusted = cp->chunk;
- else
- adjusted = entry->cache_size - reserved;
+ smallest = (100 * __wt_cache_bytes_inuse(cache)) /
+ cache->eviction_trigger;
+ if (entry->cache_size > smallest)
+ adjustment = WT_MIN(cp->chunk,
+ (entry->cache_size - smallest) / 2);
+ adjustment =
+ WT_MIN(adjustment, entry->cache_size - reserved);
/*
* Conditions for increasing the amount of resources for an
* entry:
- * - There was some activity across the pool
- * - This entry is using less than the entire cache pool
- * - The connection is using enough cache to require eviction
- * - There is space available in the pool
- * - Additional cache would benefit the connection OR
- * - The pool is less than half distributed
+ * - there is space available in the pool
+ * - the connection isn't over quota
+ * - the connection is using enough cache to require eviction
+ * - there was some activity across the pool
+ * - this entry is using less than the entire cache pool
+ * - additional cache would benefit the connection OR
+ * - the pool is less than half distributed
*/
- } else if (entry->cache_size < cp->size &&
+ } else if (!pool_full &&
+ (cache->cp_quota == 0 ||
+ entry->cache_size < cache->cp_quota) &&
__wt_cache_bytes_inuse(cache) >=
(entry->cache_size * cache->eviction_target) / 100 &&
- ((cp->currently_used < cp->size &&
- pressure > bump_threshold) ||
+ (pressure > bump_threshold ||
cp->currently_used < cp->size * 0.5)) {
- grew = 1;
- adjusted = WT_MIN(cp->chunk,
- cp->size - cp->currently_used);
+ grow = 1;
+ adjustment = WT_MIN(WT_MIN(cp->chunk,
+ cp->size - cp->currently_used),
+ cache->cp_quota - entry->cache_size);
}
- if (adjusted > 0) {
+ /*
+ * Bounds checking: don't go over the pool size or under the
+ * reserved size for this cache.
+ *
+ * Shrink by a chunk size if that doesn't drop us
+ * below the reserved size.
+ *
+ * Limit the reduction to half of the free space in the
+ * connection's cache. This should reduce cache sizes
+ * gradually without stalling application threads.
+ */
+ if (adjustment > 0) {
*adjustedp = 1;
- if (grew > 0) {
+ if (grow) {
cache->cp_skip_count = WT_CACHE_POOL_BUMP_SKIPS;
- entry->cache_size += adjusted;
- cp->currently_used += adjusted;
+ entry->cache_size += adjustment;
+ cp->currently_used += adjustment;
} else {
cache->cp_skip_count =
WT_CACHE_POOL_REDUCE_SKIPS;
WT_ASSERT(session,
- entry->cache_size >= adjusted &&
- cp->currently_used >= adjusted);
- entry->cache_size -= adjusted;
- cp->currently_used -= adjusted;
+ entry->cache_size >= adjustment &&
+ cp->currently_used >= adjustment);
+ entry->cache_size -= adjustment;
+ cp->currently_used -= adjustment;
}
WT_RET(__wt_verbose(session, WT_VERB_SHARED_CACHE,
"Allocated %s%" PRId64 " to %s",
- grew ? "" : "-", adjusted, entry->home));
+ grow ? "" : "-", adjustment, entry->home));
+
/*
* TODO: Add a loop waiting for connection to give up
* cache.
@@ -663,11 +720,13 @@ __wt_cache_pool_server(void *arg)
WT_CACHE_POOL *cp;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ int forward;
session = (WT_SESSION_IMPL *)arg;
cp = __wt_process.cache_pool;
cache = S2C(session)->cache;
+ forward = 1;
while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) &&
F_ISSET(cache, WT_CACHE_POOL_RUN)) {
@@ -695,8 +754,10 @@ __wt_cache_pool_server(void *arg)
* Continue even if there was an error. Details of errors are
* reported in the balance function.
*/
- if (F_ISSET(cache, WT_CACHE_POOL_MANAGER))
- (void)__cache_pool_balance(session);
+ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) {
+ (void)__cache_pool_balance(session, forward);
+ forward = !forward;
+ }
}
if (0) {
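
The reworked assessment combines three weighted deltas (application evictions, application waits, pages read), scales by relative cache size so smaller caches score higher, then smooths with a 9:1 exponential moving average. A condensed sketch of one member's computation (multipliers as in the diff; function and parameter names illustrative):

#include <stdint.h>

#define APP_EVICT_MULT	3
#define APP_WAIT_MULT	6
#define READ_MULT	1	/* Pages read, assuming 4KB pages. */

/* Sketch: one cache-pool member's smoothed pressure for this pass. */
static uint64_t
pass_pressure(uint64_t evicts, uint64_t waits, uint64_t reads,
    uint64_t balanced_size, uint64_t cache_size, uint64_t prev)
{
	uint64_t raw;

	raw = evicts * APP_EVICT_MULT +
	    waits * APP_WAIT_MULT + reads * READ_MULT;

	/* Weight smaller caches higher. */
	raw = (uint64_t)((double)raw *
	    ((double)balanced_size / (double)cache_size));

	/* Smooth over history: 90% previous value, 10% new. */
	return ((9 * prev + raw) / 10);
}
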
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index 76f55fa44e5..92497484408 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -55,6 +55,8 @@ __conn_dhandle_alloc(WT_SESSION_IMPL *session,
WT_ERR(__wt_spin_init(
session, &dhandle->close_lock, "data handle close"));
+ __wt_stat_dsrc_init(dhandle);
+
*dhandlep = dhandle;
return (0);
@@ -81,7 +83,7 @@ __wt_conn_dhandle_find(
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
if (checkpoint == NULL) {
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint == NULL &&
@@ -91,7 +93,7 @@ __wt_conn_dhandle_find(
}
}
} else
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
if (dhandle->checkpoint != NULL &&
@@ -404,7 +406,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
if (uri != NULL) {
bucket =
__wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl)
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
strcmp(uri, dhandle->name) == 0 &&
@@ -412,7 +414,7 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session,
WT_RET(__conn_btree_apply_internal(
session, dhandle, func, cfg));
} else {
- SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
(apply_checkpoints ||
@@ -489,7 +491,7 @@ __wt_conn_btree_apply_single(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(uri, strlen(uri));
bucket = hash % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl)
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq)
if (F_ISSET(dhandle, WT_DHANDLE_OPEN) &&
!F_ISSET(dhandle, WT_DHANDLE_DEAD) &&
(hash == dhandle->name_hash &&
@@ -538,7 +540,7 @@ __wt_conn_dhandle_close_all(
WT_ASSERT(session, session->dhandle == NULL);
bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
- SLIST_FOREACH(dhandle, &conn->dhhash[bucket], hashl) {
+ TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) {
if (strcmp(dhandle->name, uri) != 0 ||
F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
@@ -596,6 +598,7 @@ __conn_dhandle_remove(WT_SESSION_IMPL *session, int final)
bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE;
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
+ WT_ASSERT(session, dhandle != conn->cache->evict_file_next);
/* Check if the handle was reacquired by a session while we waited. */
if (!final &&
@@ -675,7 +678,7 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session)
* the list, so we do it the hard way.
*/
restart:
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_IS_METADATA(dhandle))
continue;
@@ -694,7 +697,7 @@ restart:
F_SET(session, WT_SESSION_NO_DATA_HANDLES);
/* Close the metadata file handle. */
- while ((dhandle = SLIST_FIRST(&conn->dhlh)) != NULL)
+ while ((dhandle = TAILQ_FIRST(&conn->dhqh)) != NULL)
WT_WITH_DHANDLE(session, dhandle,
WT_TRET(__wt_conn_dhandle_discard_single(session, 1, 0)));
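
The data-handle lists here move from SLIST to TAILQ, but the lookup shape is unchanged: hash the URI into a bucket, then walk only that bucket's queue. A minimal sketch with <sys/queue.h> (bucket count and names illustrative; buckets assumed already initialized with TAILQ_INIT):

#include <stdint.h>
#include <string.h>
#include <sys/queue.h>

#define HASH_SIZE	512

struct dhandle {
	const char *name;
	TAILQ_ENTRY(dhandle) hashq;	/* Per-bucket linkage. */
};
TAILQ_HEAD(dh_bucket, dhandle);

static struct dh_bucket dhhash[HASH_SIZE];

/* Sketch: find a handle by name, touching only one hash bucket. */
static struct dhandle *
dhandle_find(const char *uri, uint64_t hash)
{
	struct dhandle *dhandle;

	TAILQ_FOREACH(dhandle, &dhhash[hash % HASH_SIZE], hashq)
		if (strcmp(dhandle->name, uri) == 0)
			return (dhandle);
	return (NULL);
}
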
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 94e69897c1d..7a8a6cba838 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -21,14 +21,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
session = conn->default_session;
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) {
- SLIST_INIT(&conn->dhhash[i]); /* Data handle hash lists */
- SLIST_INIT(&conn->fhhash[i]); /* File handle hash lists */
+ TAILQ_INIT(&conn->dhhash[i]); /* Data handle hash lists */
+ TAILQ_INIT(&conn->fhhash[i]); /* File handle hash lists */
}
- SLIST_INIT(&conn->dhlh); /* Data handle list */
+ TAILQ_INIT(&conn->dhqh); /* Data handle list */
TAILQ_INIT(&conn->dlhqh); /* Library list */
TAILQ_INIT(&conn->dsrcqh); /* Data source list */
- SLIST_INIT(&conn->fhlh); /* File list */
+ TAILQ_INIT(&conn->fhqh); /* File list */
TAILQ_INIT(&conn->collqh); /* Collator list */
TAILQ_INIT(&conn->compqh); /* Compressor list */
TAILQ_INIT(&conn->encryptqh); /* Encryptor list */
@@ -45,7 +45,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_conn_config_init(session));
/* Statistics. */
- __wt_stat_init_connection_stats(&conn->stats);
+ __wt_stat_connection_init(conn);
/* Locks. */
WT_RET(__wt_spin_init(session, &conn->api_lock, "api"));
@@ -55,11 +55,14 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
WT_RET(__wt_spin_init(session, &conn->fh_lock, "file list"));
WT_RET(__wt_rwlock_alloc(session,
&conn->hot_backup_lock, "hot backup"));
+ WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table"));
WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure"));
WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema"));
WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation"));
- WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS(conn), &conn->page_lock));
- for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+
+ WT_RET(__wt_calloc_def(session, WT_PAGE_LOCKS, &conn->page_lock));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->page_lock);
+ for (i = 0; i < WT_PAGE_LOCKS; ++i)
WT_RET(
__wt_spin_init(session, &conn->page_lock[i], "btree page"));
@@ -91,8 +94,8 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn)
*/
WT_RET(__wt_spin_init(session, &conn->block_lock, "block manager"));
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
- SLIST_INIT(&conn->blockhash[i]);/* Block handle hash lists */
- SLIST_INIT(&conn->blocklh); /* Block manager list */
+ TAILQ_INIT(&conn->blockhash[i]);/* Block handle hash lists */
+ TAILQ_INIT(&conn->blockqh); /* Block manager list */
return (0);
}
@@ -138,10 +141,11 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->encryptor_lock);
__wt_spin_destroy(session, &conn->fh_lock);
WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock));
+ __wt_spin_destroy(session, &conn->las_lock);
__wt_spin_destroy(session, &conn->reconfig_lock);
__wt_spin_destroy(session, &conn->schema_lock);
__wt_spin_destroy(session, &conn->table_lock);
- for (i = 0; i < WT_PAGE_LOCKS(conn); ++i)
+ for (i = 0; i < WT_PAGE_LOCKS; ++i)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index de4bf7268ed..2b115190b06 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -287,8 +287,9 @@ __log_file_server(void *arg)
WT_DECL_RET;
WT_FH *close_fh;
WT_LOG *log;
- WT_LSN close_end_lsn, close_lsn, min_lsn;
+ WT_LSN close_end_lsn, min_lsn;
WT_SESSION_IMPL *session;
+ uint32_t filenum;
int locked;
session = arg;
@@ -300,66 +301,97 @@ __log_file_server(void *arg)
* If there is a log file to close, make sure any outstanding
* write operations have completed, then fsync and close it.
*/
- if ((close_fh = log->log_close_fh) != NULL &&
- (ret = __wt_log_extract_lognum(session, close_fh->name,
- &close_lsn.file)) == 0 &&
- close_lsn.file < log->write_lsn.file) {
+ if ((close_fh = log->log_close_fh) != NULL) {
+ WT_ERR(__wt_log_extract_lognum(session, close_fh->name,
+ &filenum));
/*
- * We've copied the file handle, clear out the one in
- * log structure to allow it to be set again.
+ * We update the close file handle before updating the
+ * close LSN when changing files. It is possible we
+			 * could see mismatched settings. If we do, yield until
+			 * the close LSN catches up. This should rarely happen.
*/
- log->log_close_fh = NULL;
- /*
- * Set the close_end_lsn to the LSN immediately after
- * ours. That is, the beginning of the next log file.
- * We need to know the LSN file number of our own close
- * in case earlier calls are still in progress and the
- * next one to move the sync_lsn into the next file for
- * later syncs.
- */
- close_lsn.offset = 0;
- close_end_lsn = close_lsn;
- close_end_lsn.file++;
- WT_ERR(__wt_fsync(session, close_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- WT_ERR(__wt_close(session, &close_fh));
- WT_ASSERT(session,
- WT_LOG_CMP(&close_end_lsn, &log->sync_lsn) >= 0);
- log->sync_lsn = close_end_lsn;
- WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
+ while (log->log_close_lsn.file < filenum)
+ __wt_yield();
+
+ if (__wt_log_cmp(
+ &log->write_lsn, &log->log_close_lsn) >= 0) {
+ /*
+ * We've copied the file handle, clear out the
+ * one in the log structure to allow it to be
+ * set again. Copy the LSN before clearing
+ * the file handle.
+ * Use a barrier to make sure the compiler does
+ * not reorder the following two statements.
+ */
+ close_end_lsn = log->log_close_lsn;
+ WT_FULL_BARRIER();
+ log->log_close_fh = NULL;
+ /*
+ * Set the close_end_lsn to the LSN immediately
+ * after ours. That is, the beginning of the
+ * next log file. We need to know the LSN
+ * file number of our own close in case earlier
+ * calls are still in progress and the next one
+ * to move the sync_lsn into the next file for
+ * later syncs.
+ */
+ close_end_lsn.file++;
+ close_end_lsn.offset = 0;
+ WT_ERR(__wt_fsync(session, close_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ WT_ERR(__wt_close(session, &close_fh));
+ WT_ASSERT(session, __wt_log_cmp(
+ &close_end_lsn, &log->sync_lsn) >= 0);
+ log->sync_lsn = close_end_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ }
}
/*
* If a later thread asked for a background sync, do it now.
*/
- if (WT_LOG_CMP(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
+ if (__wt_log_cmp(&log->bg_sync_lsn, &log->sync_lsn) > 0) {
/*
* Save the latest write LSN which is the minimum
* we will have written to disk.
*/
min_lsn = log->write_lsn;
/*
- * The sync LSN we asked for better be smaller than
- * the current written LSN.
+ * We have to wait until the LSN we asked for is
+			 * written. If it isn't, signal the wrlsn thread
+ * to get it written.
*/
- WT_ASSERT(session,
- WT_LOG_CMP(&log->bg_sync_lsn, &min_lsn) <= 0);
- WT_ERR(__wt_fsync(session, log->log_fh));
- __wt_spin_lock(session, &log->log_sync_lock);
- locked = 1;
- /*
- * The sync LSN could have advanced while we were
- * writing to disk.
- */
- if (WT_LOG_CMP(&log->sync_lsn, &min_lsn) <= 0) {
- log->sync_lsn = min_lsn;
+ if (__wt_log_cmp(&log->bg_sync_lsn, &min_lsn) <= 0) {
+ WT_ERR(__wt_fsync(session, log->log_fh));
+ __wt_spin_lock(session, &log->log_sync_lock);
+ locked = 1;
+ /*
+ * The sync LSN could have advanced while we
+ * were writing to disk.
+ */
+ if (__wt_log_cmp(
+ &log->sync_lsn, &min_lsn) <= 0) {
+ log->sync_lsn = min_lsn;
+ WT_ERR(__wt_cond_signal(
+ session, log->log_sync_cond));
+ }
+ locked = 0;
+ __wt_spin_unlock(session, &log->log_sync_lock);
+ } else {
WT_ERR(__wt_cond_signal(
- session, log->log_sync_cond));
+ session, conn->log_wrlsn_cond));
+ /*
+ * We do not want to wait potentially a second
+ * to process this. Yield to give the wrlsn
+ * thread a chance to run and try again in
+ * this case.
+ */
+ __wt_yield();
+ continue;
}
- locked = 0;
- __wt_spin_unlock(session, &log->log_sync_lock);
}
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(
@@ -394,26 +426,29 @@ typedef struct {
/*
* __wt_log_wrlsn --
* Process written log slots and attempt to coalesce them if the LSNs
- * are contiguous. Returns 1 if slots were freed, 0 if no slots were
- * freed in the progress arg. Must be called with the log slot lock held.
+ * are contiguous. The purpose of this function is to advance the
+ * write_lsn in LSN order after the buffer is written to the log file.
*/
int
-__wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
+__wt_log_wrlsn(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
WT_LOGSLOT *coalescing, *slot;
+ WT_LSN save_lsn;
size_t written_i;
uint32_t i, save_i;
conn = S2C(session);
log = conn->log;
+ __wt_spin_lock(session, &log->log_writelsn_lock);
+restart:
coalescing = NULL;
+ WT_INIT_LSN(&save_lsn);
written_i = 0;
i = 0;
- if (free_i != NULL)
- *free_i = WT_SLOT_POOL;
/*
* Walk the array once saving any slots that are in the
@@ -422,9 +457,14 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
while (i < WT_SLOT_POOL) {
save_i = i;
slot = &log->slot_pool[i++];
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = save_i;
+ /*
+ * XXX - During debugging I saw slot 0 become orphaned.
+ * I believe it is fixed, but check for now.
+ * This assertion should catch that.
+ */
+ if (slot->slot_state == 0)
+ WT_ASSERT(session,
+ slot->slot_release_lsn.file >= log->write_lsn.file);
if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
continue;
written[written_i].slot_index = save_i;
@@ -435,15 +475,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* based on the release LSN, and then look for them in order.
*/
if (written_i > 0) {
- /*
- * If wanted, reset the yield variable to indicate that we
- * have found written slots.
- */
- if (yield != NULL)
- *yield = 0;
WT_INSERTION_SORT(written, written_i,
WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
-
/*
* We know the written array is sorted by LSN. Go
* through them either advancing write_lsn or coalesce
@@ -451,8 +484,28 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
*/
for (i = 0; i < written_i; i++) {
slot = &log->slot_pool[written[i].slot_index];
+ /*
+ * The log server thread pushes out slots periodically.
+ * Sometimes they are empty slots. If we find an
+ * empty slot, where empty means the start and end LSN
+ * are the same, free it and continue.
+ */
+ if (__wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_release_lsn) == 0 &&
+ __wt_log_cmp(&slot->slot_start_lsn,
+ &slot->slot_end_lsn) == 0) {
+ __wt_log_slot_free(session, slot);
+ continue;
+ }
if (coalescing != NULL) {
- if (WT_LOG_CMP(&coalescing->slot_end_lsn,
+ /*
+ * If the write_lsn changed, we may be able to
+ * process slots. Try again.
+ */
+ if (__wt_log_cmp(
+ &log->write_lsn, &save_lsn) != 0)
+ goto restart;
+ if (__wt_log_cmp(&coalescing->slot_end_lsn,
&written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -461,6 +514,8 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to coalesce
* and free.
*/
+ coalescing->slot_last_offset =
+ slot->slot_last_offset;
coalescing->slot_end_lsn = slot->slot_end_lsn;
WT_STAT_FAST_CONN_INCR(
session, log_slot_coalesced);
@@ -473,8 +528,12 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
/*
* If this written slot is not the next LSN,
* try to start coalescing with later slots.
+ * A synchronous write may update write_lsn
+ * so save the last one we saw to check when
+ * coalescing slots.
*/
- if (WT_LOG_CMP(
+ save_lsn = log->write_lsn;
+ if (__wt_log_cmp(
&log->write_lsn, &written[i].lsn) != 0) {
coalescing = slot;
continue;
@@ -483,27 +542,29 @@ __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield)
* If we get here we have a slot to process.
* Advance the LSN and process the slot.
*/
- WT_ASSERT(session, WT_LOG_CMP(&written[i].lsn,
+ WT_ASSERT(session, __wt_log_cmp(&written[i].lsn,
&slot->slot_release_lsn) == 0);
+ if (slot->slot_start_lsn.offset !=
+ slot->slot_last_offset)
+ slot->slot_start_lsn.offset =
+ slot->slot_last_offset;
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, log->log_write_cond));
WT_STAT_FAST_CONN_INCR(session, log_write_lsn);
/*
* Signal the close thread if needed.
*/
if (F_ISSET(slot, WT_SLOT_CLOSEFH))
- WT_RET(__wt_cond_signal(
+ WT_ERR(__wt_cond_signal(
session, conn->log_file_cond));
}
- WT_RET(__wt_log_slot_free(session, slot));
- if (free_i != NULL && *free_i == WT_SLOT_POOL &&
- slot->slot_state == WT_LOG_SLOT_FREE)
- *free_i = save_i;
+ __wt_log_slot_free(session, slot);
}
}
- return (0);
+err: __wt_spin_unlock(session, &log->log_writelsn_lock);
+ return (ret);
}
/*
@@ -515,31 +576,26 @@ __log_wrlsn_server(void *arg)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_LOG *log;
WT_SESSION_IMPL *session;
- int locked, yield;
session = arg;
conn = S2C(session);
- log = conn->log;
- locked = yield = 0;
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_wrlsn(session, NULL, &yield));
- locked = 0;
- __wt_spin_unlock(session, &log->log_slot_lock);
- if (++yield < 1000)
- __wt_yield();
- else
- WT_ERR(__wt_cond_wait(session,
- conn->log_wrlsn_cond, 100000));
+ /*
+ * Write out any log record buffers.
+ */
+ WT_ERR(__wt_log_wrlsn(session));
+ WT_ERR(__wt_cond_wait(session, conn->log_wrlsn_cond, 10000));
}
+ /*
+ * On close, do this one more time: there may be straggling
+ * log records still sitting in the buffers.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+ WT_ERR(__wt_log_wrlsn(session));
if (0) {
err: __wt_err(session, ret, "log wrlsn server error");
}
- if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
return (WT_THREAD_RET_VALUE);
}
@@ -554,44 +610,81 @@ __log_server(void *arg)
WT_DECL_RET;
WT_LOG *log;
WT_SESSION_IMPL *session;
- u_int locked;
+ int freq_per_sec, signalled;
session = arg;
conn = S2C(session);
log = conn->log;
- locked = 0;
+ signalled = 0;
+
+ /*
+ * Set this to the number of times per second we want to force out the
+ * log slot buffer.
+ */
+#define WT_FORCE_PER_SECOND 20
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * The log server thread does a variety of work: it forces out any
+ * buffered log writes, pre-allocates log files and performs log
+ * archiving. The wrlsn thread does not force out buffered writes
+ * because we want it to process and advance the write_lsn as
+ * quickly as possible. Nor does the log file server thread force
+ * out the writes: it makes fsync calls that can take a long time,
+ * and we don't want log records sitting in the buffer for as long
+ * as it takes to sync out an earlier file.
+ */
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
- * Perform log pre-allocation.
+ * Slots depend on future activity. Force out buffered
+ * writes in case we are idle. This cannot be part of the
+ * wrlsn thread because it interacts with advancing the
+ * write_lsn: a synchronous buffer may need to wait for the
+ * write_lsn to advance, and we would end up with a hang.
*/
- if (conn->log_prealloc > 0)
- WT_ERR(__log_prealloc_once(session));
+ WT_ERR_BUSY_OK(__wt_log_force_write(session, 0));
/*
- * Perform the archive.
+ * We don't want to archive or pre-allocate files as often as
+ * we want to force out log buffers. Only do it once per second
+ * or if the condition was signalled.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
- if (__wt_try_writelock(
- session, log->log_archive_lock) == 0) {
- locked = 1;
- WT_ERR(__log_archive_once(session, 0));
- WT_ERR( __wt_writeunlock(
- session, log->log_archive_lock));
- locked = 0;
- } else
- WT_ERR(__wt_verbose(session, WT_VERB_LOG,
- "log_archive: Blocked due to open log "
- "cursor holding archive lock"));
+ if (--freq_per_sec <= 0 || signalled != 0) {
+ freq_per_sec = WT_FORCE_PER_SECOND;
+
+ /*
+ * Perform log pre-allocation.
+ */
+ if (conn->log_prealloc > 0)
+ WT_ERR(__log_prealloc_once(session));
+
+ /*
+ * Perform the archive.
+ */
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE)) {
+ if (__wt_try_writelock(
+ session, log->log_archive_lock) == 0) {
+ ret = __log_archive_once(session, 0);
+ WT_TRET(__wt_writeunlock(
+ session, log->log_archive_lock));
+ WT_ERR(ret);
+ } else
+ WT_ERR(
+ __wt_verbose(session, WT_VERB_LOG,
+ "log_archive: Blocked due to open "
+ "log cursor holding archive lock"));
+ }
}
+
/* Wait until the next event. */
- WT_ERR(__wt_cond_wait(session, conn->log_cond, WT_MILLION));
+ WT_ERR(__wt_cond_wait_signal(session, conn->log_cond,
+ WT_MILLION / WT_FORCE_PER_SECOND, &signalled));
}
if (0) {
err: __wt_err(session, ret, "log server error");
}
- if (locked)
- (void)__wt_writeunlock(session, log->log_archive_lock);
return (WT_THREAD_RET_VALUE);
}
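[Editor's note: the reworked __log_server loop wakes roughly WT_FORCE_PER_SECOND times per second. Buffered log writes are forced out on every wakeup, while the more expensive pre-allocation and archiving run only when a countdown expires (about once per second) or the condition was signalled. A standalone sketch of that countdown pattern follows, with hypothetical stand-ins for the real work and the real timed wait.]

	#include <stdbool.h>
	#include <stdio.h>

	/* How many times per second the loop forces out buffered writes. */
	#define FORCE_PER_SECOND 20

	/* Hypothetical stand-ins for the real work and the timed wait. */
	static void force_buffered_writes(void) { printf("force\n"); }
	static void prealloc_and_archive(void) { printf("archive\n"); }
	static bool wait_ms_or_signal(unsigned ms) { (void)ms; return (false); }

	int
	main(void)
	{
		bool signalled;
		int countdown, i;

		signalled = false;
		countdown = FORCE_PER_SECOND;
		for (i = 0; i < 40; ++i) {	/* stands in for the server loop */
			/* Cheap, frequent work: push out buffered log writes. */
			force_buffered_writes();

			/*
			 * Expensive, infrequent work: run it when the countdown
			 * expires (about once per second) or when signalled.
			 */
			if (--countdown <= 0 || signalled) {
				countdown = FORCE_PER_SECOND;
				prealloc_and_archive();
			}

			/* Sleep a fraction of a second, noting any signal. */
			signalled = wait_ms_or_signal(1000 / FORCE_PER_SECOND);
		}
		return (0);
	}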
@@ -624,6 +717,8 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
+ WT_RET(__wt_spin_init(session, &log->log_writelsn_lock,
+ "log write LSN"));
WT_RET(__wt_rwlock_alloc(session,
&log->log_archive_lock, "log archive lock"));
if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
@@ -755,13 +850,11 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_tid));
conn->log_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
if (conn->log_file_tid_set) {
WT_TRET(__wt_cond_signal(session, conn->log_file_cond));
WT_TRET(__wt_thread_join(session, conn->log_file_tid));
conn->log_file_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
if (conn->log_file_session != NULL) {
wt_session = &conn->log_file_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
@@ -772,13 +865,13 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid));
conn->log_wrlsn_tid_set = 0;
}
- WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
if (conn->log_wrlsn_session != NULL) {
wt_session = &conn->log_wrlsn_session->iface;
WT_TRET(wt_session->close(wt_session, NULL));
conn->log_wrlsn_session = NULL;
}
+ WT_TRET(__wt_log_slot_destroy(session));
WT_TRET(__wt_log_close(session));
/* Close the server thread's session. */
@@ -788,13 +881,18 @@ __wt_logmgr_destroy(WT_SESSION_IMPL *session)
conn->log_session = NULL;
}
- WT_TRET(__wt_log_slot_destroy(session));
+ /* Destroy the condition variables now that all threads are stopped. */
+ WT_TRET(__wt_cond_destroy(session, &conn->log_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond));
+ WT_TRET(__wt_cond_destroy(session, &conn->log_wrlsn_cond));
+
WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond));
WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond));
WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock));
__wt_spin_destroy(session, &conn->log->log_lock);
__wt_spin_destroy(session, &conn->log->log_slot_lock);
__wt_spin_destroy(session, &conn->log->log_sync_lock);
+ __wt_spin_destroy(session, &conn->log->log_writelsn_lock);
__wt_free(session, conn->log_path);
__wt_free(session, conn->log);
return (ret);
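[Editor's note: the __wt_logmgr_destroy reordering above moves the condition-variable destruction after every server thread has been signalled and joined; destroying a condition variable a thread may still be waiting on is undefined behavior. A minimal pthread sketch of the safe ordering follows, using hypothetical names rather than WiredTiger's wrappers.]

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static int running = 1;

	/* A server thread that sleeps on the condition until told to stop. */
	static void *
	server(void *arg)
	{
		(void)arg;
		pthread_mutex_lock(&lock);
		while (running)
			pthread_cond_wait(&cond, &lock);
		pthread_mutex_unlock(&lock);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, server, NULL);

		/* Tell the thread to stop, wake it, and wait for it to exit. */
		pthread_mutex_lock(&lock);
		running = 0;
		pthread_cond_signal(&cond);
		pthread_mutex_unlock(&lock);
		pthread_join(tid, NULL);

		/*
		 * Only now is it safe to destroy the condition variable: no
		 * thread can still be blocked on it.
		 */
		pthread_cond_destroy(&cond);
		pthread_mutex_destroy(&lock);
		printf("clean shutdown\n");
		return (0);
	}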
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index c4350d90adb..8bc69bb3e80 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -30,6 +30,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
/* WT_SESSION_IMPL array. */
WT_RET(__wt_calloc(session,
conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->sessions);
/*
* Open the default session. We open this before starting service
@@ -110,14 +111,17 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
F_CLR(conn, WT_CONN_SERVER_RUN);
WT_TRET(__wt_async_destroy(session));
WT_TRET(__wt_lsm_manager_destroy(session));
+ WT_TRET(__wt_sweep_destroy(session));
F_SET(conn, WT_CONN_CLOSING);
WT_TRET(__wt_checkpoint_server_destroy(session));
WT_TRET(__wt_statlog_destroy(session, 1));
- WT_TRET(__wt_sweep_destroy(session));
WT_TRET(__wt_evict_destroy(session));
+ /* Shut down the lookaside table after all eviction is complete. */
+ WT_TRET(__wt_las_destroy(session));
+
/* Close open data handles. */
WT_TRET(__wt_conn_dhandle_discard(session));
@@ -128,7 +132,8 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* conditional because we allocate the log path so that printlog can
* run without running logging or recovery.
*/
- if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
+ FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
WT_TRET(__wt_txn_checkpoint_log(
session, 1, WT_TXN_LOG_CKPT_STOP, NULL));
F_CLR(conn, WT_CONN_LOG_SERVER_RUN);
@@ -145,14 +150,14 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* Complain if files weren't closed, ignoring the lock file, we'll
* close it in a minute.
*/
- SLIST_FOREACH(fh, &conn->fhlh, l) {
+ TAILQ_FOREACH(fh, &conn->fhqh, q) {
if (fh == conn->lock_fh)
continue;
__wt_errx(session,
"Connection has open file handles: %s", fh->name);
WT_TRET(__wt_close(session, &fh));
- fh = SLIST_FIRST(&conn->fhlh);
+ fh = TAILQ_FIRST(&conn->fhqh);
}
/* Disconnect from shared cache - must be before cache destroy. */
@@ -236,9 +241,7 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[])
/* Run recovery. */
WT_RET(__wt_txn_recover(session));
- /*
- * Start the handle sweep thread.
- */
+ /* Start the handle sweep thread. */
WT_RET(__wt_sweep_create(session));
/* Start the optional async threads. */
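[Editor's note: the connection-close path above now walks the file-handle list as a TAILQ and, when it finds a leaked handle, closes it and restarts from the head rather than following a next pointer out of freed memory. A minimal sketch of that drain pattern using <sys/queue.h> follows; struct fh is hypothetical, and the real code also skips the lock file.]

	#include <stdio.h>
	#include <sys/queue.h>

	/* Hypothetical file handle kept on a connection-wide tail queue. */
	struct fh {
		const char *name;
		TAILQ_ENTRY(fh) q;
	};
	static TAILQ_HEAD(, fh) fhqh = TAILQ_HEAD_INITIALIZER(fhqh);

	int
	main(void)
	{
		struct fh a, b, *fh;

		a.name = "a.wt";
		b.name = "b.wt";
		TAILQ_INSERT_TAIL(&fhqh, &a, q);
		TAILQ_INSERT_TAIL(&fhqh, &b, q);

		/*
		 * Complain about and close any handles left open. Closing a
		 * handle removes it from the queue, so restart from the head
		 * each time instead of walking a freed element's next pointer.
		 */
		while ((fh = TAILQ_FIRST(&fhqh)) != NULL) {
			fprintf(stderr, "open file handle: %s\n", fh->name);
			TAILQ_REMOVE(&fhqh, fh, q);	/* stands in for close */
		}
		return (0);
	}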
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index 9c438c01cd2..3b188bfd22a 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -42,11 +42,25 @@ __stat_sources_free(WT_SESSION_IMPL *session, char ***sources)
void
__wt_conn_stat_init(WT_SESSION_IMPL *session)
{
+ WT_CONNECTION_IMPL *conn;
+ WT_CONNECTION_STATS **stats;
+
+ conn = S2C(session);
+ stats = conn->stats;
+
__wt_async_stats_update(session);
__wt_cache_stats_update(session);
+ __wt_las_stats_update(session);
__wt_txn_stats_update(session);
- WT_CONN_STAT(session, file_open) = S2C(session)->open_file_count;
+ WT_STAT_SET(session, stats, file_open, conn->open_file_count);
+ WT_STAT_SET(session,
+ stats, session_cursor_open, conn->open_cursor_count);
+ WT_STAT_SET(session, stats, dh_conn_handle_count, conn->dhandle_count);
+ WT_STAT_SET(session,
+ stats, rec_split_stashed_objects, conn->split_stashed_objects);
+ WT_STAT_SET(session,
+ stats, rec_split_stashed_bytes, conn->split_stashed_bytes);
}
/*
@@ -135,11 +149,11 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
{
WT_CONNECTION_IMPL *conn;
WT_CURSOR *cursor;
+ WT_CURSOR_STAT *cst;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_STATS *stats;
- u_int i;
- uint64_t max;
+ int64_t *stats;
+ int i;
const char *uri;
const char *cfg[] = {
WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL };
@@ -163,15 +177,14 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, int conn_stats)
*/
switch (ret = __wt_curstat_open(session, uri, cfg, &cursor)) {
case 0:
- max = conn_stats ?
- sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS) :
- sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
- for (i = 0,
- stats = WT_CURSOR_STATS(cursor); i < max; ++i, ++stats)
+ cst = (WT_CURSOR_STAT *)cursor;
+ for (stats = cst->stats, i = 0; i < cst->stats_count; ++i)
WT_ERR(__wt_fprintf(conn->stat_fp,
- "%s %" PRIu64 " %s %s\n",
- conn->stat_stamp,
- stats->v, name, stats->desc));
+ "%s %" PRId64 " %s %s\n",
+ conn->stat_stamp, stats[i],
+ name, conn_stats ?
+ __wt_stat_connection_desc(i) :
+ __wt_stat_dsrc_desc(i)));
WT_ERR(cursor->close(cursor));
break;
case EBUSY:
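[Editor's note: the statistics rework visible here replaces per-entry structures (value plus description) with a plain array of int64_t counters and a separate description lookup by index, which is what lets __statlog_dump print cst->stats[i] alongside __wt_stat_connection_desc(i) or __wt_stat_dsrc_desc(i). A minimal sketch of the layout follows, with hypothetical slot names and descriptions.]

	#include <inttypes.h>
	#include <stdio.h>

	/*
	 * Hypothetical statistics block: a bare array of int64_t counters,
	 * with descriptions in a parallel table looked up by slot index.
	 */
	#define STAT_FILE_OPEN		0
	#define STAT_CURSOR_OPEN	1
	#define STAT_COUNT		2

	static const char *
	stat_desc(int slot)
	{
		static const char * const descs[STAT_COUNT] = {
			"files currently open",
			"open cursor count",
		};
		return (descs[slot]);
	}

	int
	main(void)
	{
		int64_t stats[STAT_COUNT] = { 17, 42 };
		int i;

		/* Dump value and description, as a statistics log would. */
		for (i = 0; i < STAT_COUNT; ++i)
			printf("%" PRId64 " %s\n", stats[i], stat_desc(i));
		return (0);
	}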
diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c
index ec6f628a02e..8da32416242 100644
--- a/src/conn/conn_sweep.c
+++ b/src/conn/conn_sweep.c
@@ -8,55 +8,58 @@
#include "wt_internal.h"
+#define WT_DHANDLE_CAN_DISCARD(dhandle) \
+ (!F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN) && \
+ dhandle->session_inuse == 0 && dhandle->session_ref == 0)
+
/*
* __sweep_mark --
* Mark idle handles with a time of death, and note if we see dead
* handles.
*/
static int
-__sweep_mark(WT_SESSION_IMPL *session, int *dead_handlesp)
+__sweep_mark(WT_SESSION_IMPL *session, time_t now)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
- time_t now;
conn = S2C(session);
- *dead_handlesp = 0;
- /* Don't discard handles that have been open recently. */
- WT_RET(__wt_seconds(session, &now));
-
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (WT_IS_METADATA(dhandle))
continue;
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
- ++*dead_handlesp;
- continue;
- }
- if (dhandle->session_inuse != 0 ||
- now <= dhandle->timeofdeath + conn->sweep_idle_time ||
- conn->sweep_idle_time == 0)
- continue;
- if (dhandle->timeofdeath == 0) {
- dhandle->timeofdeath = now;
- WT_STAT_FAST_CONN_INCR(session, dh_conn_tod);
+
+ /*
+ * The in-use count is sometimes incremented internally, for
+ * example by eviction. Don't keep handles alive because of
+ * those cases, but if we see multiple cursors open, clear
+ * the time of death.
+ */
+ if (dhandle->session_inuse > 1)
+ dhandle->timeofdeath = 0;
+
+ /*
+ * If the handle is open exclusive or currently in use, or the
+ * time of death is already set, move on.
+ */
+ if (F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ||
+ dhandle->session_inuse > 0 ||
+ dhandle->timeofdeath != 0)
continue;
- }
- /* We now have a candidate to close. */
- ++*dead_handlesp;
+ dhandle->timeofdeath = now;
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_tod);
}
return (0);
}
/*
- * __sweep_expire_handle --
+ * __sweep_expire_one --
* Mark a single handle dead.
*/
static int
-__sweep_expire_handle(WT_SESSION_IMPL *session)
+__sweep_expire_one(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
@@ -113,42 +116,31 @@ err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
* until we have reached the configured minimum number of handles.
*/
static int
-__sweep_expire(WT_SESSION_IMPL *session)
+__sweep_expire(WT_SESSION_IMPL *session, time_t now)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- time_t now;
conn = S2C(session);
- /* If sweep_idle_time is 0, then we won't expire any cursors */
- if (conn->sweep_idle_time == 0)
- return (0);
-
- /* Don't discard handles that have been open recently. */
- WT_RET(__wt_seconds(session, &now));
-
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
/*
- * Ignore open files once the open file count reaches the
+ * Ignore open files once the btree file count is below the
* minimum number of handles.
*/
- if (conn->open_file_count < conn->sweep_handles_min)
+ if (conn->open_btree_count < conn->sweep_handles_min)
break;
- if (WT_IS_METADATA(dhandle))
- continue;
- if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- F_ISSET(dhandle, WT_DHANDLE_DEAD))
- continue;
- if (dhandle->session_inuse != 0 ||
+ if (WT_IS_METADATA(dhandle) ||
+ !F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
+ dhandle->session_inuse != 0 ||
+ dhandle->timeofdeath == 0 ||
now <= dhandle->timeofdeath + conn->sweep_idle_time)
continue;
WT_WITH_DHANDLE(session, dhandle,
- ret = __sweep_expire_handle(session));
+ ret = __sweep_expire_one(session));
WT_RET_BUSY_OK(ret);
}
@@ -156,11 +148,11 @@ __sweep_expire(WT_SESSION_IMPL *session)
}
/*
- * __sweep_flush --
- * Flush pages from dead trees.
+ * __sweep_discard_trees --
+ * Discard pages from dead trees.
*/
static int
-__sweep_flush(WT_SESSION_IMPL *session)
+__sweep_discard_trees(WT_SESSION_IMPL *session, u_int *dead_handlesp)
{
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
@@ -168,8 +160,12 @@ __sweep_flush(WT_SESSION_IMPL *session)
conn = S2C(session);
- WT_STAT_FAST_CONN_INCR(session, dh_conn_sweeps);
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ *dead_handlesp = 0;
+
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
+ if (WT_DHANDLE_CAN_DISCARD(dhandle))
+ ++*dead_handlesp;
+
if (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
!F_ISSET(dhandle, WT_DHANDLE_DEAD))
continue;
@@ -178,9 +174,12 @@ __sweep_flush(WT_SESSION_IMPL *session)
WT_WITH_DHANDLE(session, dhandle, ret =
__wt_conn_btree_sync_and_close(session, 0, 0));
- /* We closed the btree handle, bump the statistic. */
- if (ret == 0)
- WT_STAT_FAST_CONN_INCR(session, dh_conn_handles);
+ /* We closed the btree handle. */
+ if (ret == 0) {
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_close);
+ ++*dead_handlesp;
+ } else
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
WT_RET_BUSY_OK(ret);
}
@@ -189,8 +188,41 @@ __sweep_flush(WT_SESSION_IMPL *session)
}
/*
+ * __sweep_remove_one --
+ * Remove a closed handle from the connection list.
+ */
+static int
+__sweep_remove_one(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle)
+{
+ WT_DECL_RET;
+
+ /* Try to get exclusive access. */
+ WT_RET(__wt_try_writelock(session, dhandle->rwlock));
+
+ /*
+ * If there are no longer any references to the handle in any
+ * sessions, attempt to discard it.
+ */
+ if (!WT_DHANDLE_CAN_DISCARD(dhandle))
+ WT_ERR(EBUSY);
+
+ WT_WITH_DHANDLE(session, dhandle,
+ ret = __wt_conn_dhandle_discard_single(session, 0, 1));
+
+ /*
+ * If the handle was not successfully discarded, unlock it and
+ * don't retry the discard until it times out again.
+ */
+ if (ret != 0) {
+err: WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ }
+
+ return (ret);
+}
+
+/*
* __sweep_remove_handles --
- * Remove closed dhandles from the connection list.
+ * Remove closed handles from the connection list.
*/
static int
__sweep_remove_handles(WT_SESSION_IMPL *session)
@@ -200,41 +232,23 @@ __sweep_remove_handles(WT_SESSION_IMPL *session)
WT_DECL_RET;
conn = S2C(session);
- dhandle = SLIST_FIRST(&conn->dhlh);
- for (; dhandle != NULL; dhandle = dhandle_next) {
- dhandle_next = SLIST_NEXT(dhandle, l);
+ for (dhandle = TAILQ_FIRST(&conn->dhqh);
+ dhandle != NULL;
+ dhandle = dhandle_next) {
+ dhandle_next = TAILQ_NEXT(dhandle, q);
if (WT_IS_METADATA(dhandle))
continue;
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- dhandle->session_inuse != 0 ||
- dhandle->session_ref != 0)
- continue;
-
- /* Make sure we get exclusive access. */
- if ((ret =
- __wt_try_writelock(session, dhandle->rwlock)) == EBUSY)
- continue;
- WT_RET(ret);
-
- /*
- * If there are no longer any references to the handle in any
- * sessions, attempt to discard it.
- */
- if (F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
- dhandle->session_inuse != 0 || dhandle->session_ref != 0) {
- WT_RET(__wt_writeunlock(session, dhandle->rwlock));
+ if (!WT_DHANDLE_CAN_DISCARD(dhandle))
continue;
- }
-
- WT_WITH_DHANDLE(session, dhandle,
- ret = __wt_conn_dhandle_discard_single(session, 0, 1));
- /* If the handle was not successfully discarded, unlock it. */
- if (ret != 0)
- WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
+ WT_WITH_HANDLE_LIST_LOCK(session,
+ ret = __sweep_remove_one(session, dhandle));
+ if (ret == 0)
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_remove);
+ else
+ WT_STAT_FAST_CONN_INCR(session, dh_sweep_ref);
WT_RET_BUSY_OK(ret);
- WT_STAT_FAST_CONN_INCR(session, dh_conn_ref);
}
return (ret == EBUSY ? 0 : ret);
@@ -250,7 +264,8 @@ __sweep_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- int dead_handles;
+ time_t now;
+ u_int dead_handles;
session = arg;
conn = S2C(session);
@@ -263,35 +278,37 @@ __sweep_server(void *arg)
/* Wait until the next event. */
WT_ERR(__wt_cond_wait(session, conn->sweep_cond,
(uint64_t)conn->sweep_interval * WT_MILLION));
+ WT_ERR(__wt_seconds(session, &now));
+
+ WT_STAT_FAST_CONN_INCR(session, dh_sweeps);
/*
- * Mark handles with a time of death, and report whether any
- * handles are marked dead.
+ * Sweep the lookaside table. If the lookaside table hasn't yet
+ * been written, there's no work to do.
*/
- WT_ERR(__sweep_mark(session, &dead_handles));
+ if (__wt_las_is_written(session))
+ WT_ERR(__wt_las_sweep(session));
/*
- * We only want to flush and expire if there are no dead handles
- * and if either the sweep_idle_time is not 0, or if we have
- * reached the configured limit of handles.
+ * Mark handles with a time of death, and report whether any
+ * handles are marked dead. If sweep_idle_time is 0, handles
+ * never become idle.
*/
- if (dead_handles == 0 &&
- (conn->open_file_count < conn->sweep_handles_min ||
- conn->sweep_idle_time != 0))
- continue;
+ if (conn->sweep_idle_time != 0)
+ WT_ERR(__sweep_mark(session, now));
- /* Close handles if we have reached the configured limit */
- if (conn->open_file_count >= conn->sweep_handles_min) {
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __sweep_expire(session));
- WT_ERR(ret);
- }
+ /*
+ * Close handles if we have reached the configured limit.
+ * If sweep_idle_time is 0, handles never become idle.
+ */
+ if (conn->sweep_idle_time != 0 &&
+ conn->open_btree_count >= conn->sweep_handles_min)
+ WT_ERR(__sweep_expire(session, now));
- WT_ERR(__sweep_flush(session));
+ WT_ERR(__sweep_discard_trees(session, &dead_handles));
- WT_WITH_HANDLE_LIST_LOCK(session,
- ret = __sweep_remove_handles(session));
- WT_ERR(ret);
+ if (dead_handles > 0)
+ WT_ERR(__sweep_remove_handles(session));
}
if (0) {
@@ -349,8 +366,14 @@ __wt_sweep_create(WT_SESSION_IMPL *session)
/*
* Handle sweep does enough I/O it may be called upon to perform slow
* operations for the block manager.
+ *
+ * The sweep thread sweeps the lookaside table for outdated records;
+ * it gets its own cursor for that purpose.
+ *
+ * Don't tap the sweep thread for eviction.
*/
- F_SET(session, WT_SESSION_CAN_WAIT);
+ F_SET(session, WT_SESSION_CAN_WAIT |
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION);
WT_RET(__wt_cond_alloc(
session, "handle sweep server", 0, &conn->sweep_cond));
@@ -389,5 +412,9 @@ __wt_sweep_destroy(WT_SESSION_IMPL *session)
conn->sweep_session = NULL;
}
+
+ /* Discard any saved lookaside key. */
+ __wt_buf_free(session, &conn->las_sweep_key);
+
return (ret);
}
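[Editor's note: the sweep rework above splits idle-handle expiry into two steps. One sweep stamps an unused handle with a time of death; only a later sweep that finds the stamp older than the configured idle time actually expires the handle, and any renewed use clears the stamp. A minimal sketch of that two-phase check follows; struct handle is hypothetical, and the real code additionally tolerates internal in-use increments such as eviction.]

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	/* Hypothetical handle: an in-use count and an idle timestamp. */
	struct handle {
		int in_use;
		time_t time_of_death;	/* 0 means "not idle yet" */
	};

	#define SWEEP_IDLE_TIME 30	/* seconds a handle may sit idle */

	/*
	 * Mark a handle idle the first time we see it unused; expire it
	 * only if a later sweep finds the mark old enough.
	 */
	static bool
	sweep_one(struct handle *h, time_t now)
	{
		if (h->in_use > 0) {
			h->time_of_death = 0;	/* busy again: clear the mark */
			return (false);
		}
		if (h->time_of_death == 0) {
			h->time_of_death = now;	/* first idle sighting */
			return (false);
		}
		return (now > h->time_of_death + SWEEP_IDLE_TIME);
	}

	int
	main(void)
	{
		struct handle h = { 0, 0 };
		time_t now = time(NULL);

		(void)sweep_one(&h, now);	/* stamps the time of death */
		printf("expire now? %d\n", sweep_one(&h, now));
		printf("expire later? %d\n", sweep_one(&h, now + 60));
		return (0);
	}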
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 60d94697189..3d9e5e405e8 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -514,17 +514,23 @@ static int
__backup_list_all_append(WT_SESSION_IMPL *session, const char *cfg[])
{
WT_CURSOR_BACKUP *cb;
+ const char *name;
WT_UNUSED(cfg);
cb = session->bkp_cursor;
+ name = session->dhandle->name;
/* Ignore files in the process of being bulk-loaded. */
if (F_ISSET(S2BT(session), WT_BTREE_BULK))
return (0);
+ /* Ignore the lookaside table. */
+ if (strcmp(name, WT_LAS_URI) == 0)
+ return (0);
+
/* Add the file to the list of files to be copied. */
- return (__backup_list_append(session, cb, session->dhandle->name));
+ return (__backup_list_append(session, cb, name));
}
/*
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index c58d6899150..8ee57d24413 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -510,7 +510,7 @@ __wt_curds_open(
source = data_source->source;
source->session = (WT_SESSION *)session;
memset(&source->q, 0, sizeof(source->q));
- source->recno = 0;
+ source->recno = WT_RECNO_OOB;
memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
memset(&source->key, 0, sizeof(source->key));
memset(&source->value, 0, sizeof(source->value));
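[Editor's note: the cur_ds.c and cur_std.c changes in this patch replace a literal 0 with WT_RECNO_OOB: record numbers are 1-based, so zero doubles as the out-of-band "no record" sentinel, and attempts to set it as a key are rejected. A minimal sketch of the sentinel idea follows, with a hypothetical RECNO_OOB and set_recno.]

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Hypothetical out-of-band record number: record numbers are
	 * 1-based, so zero can serve as the "no such record" sentinel.
	 */
	#define RECNO_OOB	0

	static int
	set_recno(uint64_t *slot, uint64_t recno)
	{
		if (recno == RECNO_OOB) {
			fprintf(stderr,
			    "%d is an invalid record number\n", RECNO_OOB);
			return (-1);
		}
		*slot = recno;
		return (0);
	}

	int
	main(void)
	{
		uint64_t recno = RECNO_OOB;	/* initialized to "none" */

		(void)set_recno(&recno, 0);	/* rejected */
		(void)set_recno(&recno, 1);	/* first valid record */
		printf("recno %llu\n", (unsigned long long)recno);
		return (0);
	}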
diff --git a/src/cursor/cur_file.c b/src/cursor/cur_file.c
index d30a2a04c22..436227847af 100644
--- a/src/cursor/cur_file.c
+++ b/src/cursor/cur_file.c
@@ -369,15 +369,20 @@ __curfile_close(WT_CURSOR *cursor)
__wt_buf_free(session, &cbulk->last);
}
- WT_TRET(__wt_btcur_close(cbt));
- if (cbt->btree != NULL) {
+ WT_TRET(__wt_btcur_close(cbt, 0));
+ /* The URI is owned by the btree handle. */
+ cursor->internal_uri = NULL;
+ WT_TRET(__wt_cursor_close(cursor));
+
+ /*
+ * Note: release the data handle last so that cursor statistics are
+ * updated correctly.
+ */
+ if (session->dhandle != NULL) {
/* Increment the data-source's in-use counter. */
__wt_cursor_dhandle_decr_use(session);
WT_TRET(__wt_session_release_btree(session));
}
- /* The URI is owned by the btree handle. */
- cursor->internal_uri = NULL;
- WT_TRET(__wt_cursor_close(cursor));
err: API_END_RET(session, ret);
}
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index 7dad85e9d38..045663b3614 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -130,7 +130,8 @@ __curindex_move(WT_CURSOR_INDEX *cindex)
(*cp)->recno = first->recno;
}
F_SET(*cp, WT_CURSTD_KEY_EXT);
- WT_RET((*cp)->search(*cp));
+ if (cindex->cg_needvalue[i])
+ WT_RET((*cp)->search(*cp));
}
F_SET(&cindex->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -320,6 +321,7 @@ __curindex_close(WT_CURSOR *cursor)
*cp = NULL;
}
+ __wt_free(session, cindex->cg_needvalue);
__wt_free(session, cindex->cg_cursors);
if (cindex->key_plan != idx->key_plan)
__wt_free(session, cindex->key_plan);
@@ -353,14 +355,19 @@ __curindex_open_colgroups(
/* Child cursors are opened with dump disabled. */
const char *cfg[] = { cfg_arg[0], cfg_arg[1], "dump=\"\"", NULL };
char *proj;
+ size_t cgcnt;
table = cindex->table;
- WT_RET(__wt_calloc_def(session, WT_COLGROUPS(table), &cp));
+ cgcnt = WT_COLGROUPS(table);
+ WT_RET(__wt_calloc_def(session, cgcnt, &cindex->cg_needvalue));
+ WT_RET(__wt_calloc_def(session, cgcnt, &cp));
cindex->cg_cursors = cp;
/* Work out which column groups we need. */
for (proj = (char *)cindex->value_plan; *proj != '\0'; proj++) {
arg = strtoul(proj, &proj, 10);
+ if (*proj == WT_PROJ_VALUE)
+ cindex->cg_needvalue[arg] = 1;
if ((*proj != WT_PROJ_KEY && *proj != WT_PROJ_VALUE) ||
cp[arg] != NULL)
continue;
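[Editor's note: the cg_needvalue array added above is filled while parsing the index cursor's value plan: only column groups whose values are actually projected get marked, and __curindex_move can then skip the search call for key-only groups. A minimal sketch of marking needed groups from a plan string follows, using a hypothetical plan syntax with 'k'/'v' markers rather than WiredTiger's WT_PROJ_* characters.]

	#include <stdio.h>
	#include <stdlib.h>

	/*
	 * Hypothetical projection plan: each element is a column-group
	 * number followed by 'k' (key only) or 'v' (value needed). Only
	 * groups whose values are projected need a full value search.
	 */
	int
	main(void)
	{
		const char *plan = "0k2v1v";
		int need_value[3] = { 0, 0, 0 };
		char *p;
		unsigned long cg;
		int i;

		for (p = (char *)plan; *p != '\0'; ++p) {
			cg = strtoul(p, &p, 10);
			if (*p == 'v')
				need_value[cg] = 1;
		}
		for (i = 0; i < 3; ++i)
			printf("cg %d: %s\n", i,
			    need_value[i] ? "search for value" : "key only");
		return (0);
	}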
diff --git a/src/cursor/cur_log.c b/src/cursor/cur_log.c
index 3376f2a3166..ade9fd18962 100644
--- a/src/cursor/cur_log.c
+++ b/src/cursor/cur_log.c
@@ -74,7 +74,7 @@ __curlog_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
acl = (WT_CURSOR_LOG *)a;
bcl = (WT_CURSOR_LOG *)b;
WT_ASSERT(session, cmpp != NULL);
- *cmpp = WT_LOG_CMP(acl->cur_lsn, bcl->cur_lsn);
+ *cmpp = __wt_log_cmp(acl->cur_lsn, bcl->cur_lsn);
/*
* If both are on the same LSN, compare step counter.
*/
@@ -392,6 +392,12 @@ __wt_curlog_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));
+ /*
+ * The user may be trying to read a log record they just wrote.
+ * Log records may be buffered, so force any out now.
+ */
+ WT_ERR(__wt_log_force_write(session, 1));
+
/* Log cursors block archiving. */
WT_ERR(__wt_readlock(session, log->log_archive_lock));
diff --git a/src/cursor/cur_stat.c b/src/cursor/cur_stat.c
index 82568401319..2216a1d969d 100644
--- a/src/cursor/cur_stat.c
+++ b/src/cursor/cur_stat.c
@@ -113,12 +113,12 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
if (F_ISSET(cursor, WT_CURSTD_RAW)) {
WT_ERR(__wt_struct_size(session, &size, cursor->value_format,
- cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
cst->pv.data, cst->v));
WT_ERR(__wt_buf_initsize(session, &cursor->value, size));
WT_ERR(__wt_struct_pack(session, cursor->value.mem, size,
cursor->value_format,
- cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc,
+ cst->stats_desc(WT_STAT_KEY_OFFSET(cst)),
cst->pv.data, cst->v));
item = va_arg(ap, WT_ITEM *);
@@ -130,7 +130,7 @@ __curstat_get_value(WT_CURSOR *cursor, ...)
* pointer support isn't documented, but it's a cheap test.
*/
if ((p = va_arg(ap, const char **)) != NULL)
- *p = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].desc;
+ *p = cst->stats_desc(WT_STAT_KEY_OFFSET(cst));
if ((p = va_arg(ap, const char **)) != NULL)
*p = cst->pv.data;
if ((v = va_arg(ap, uint64_t *)) != NULL)
@@ -215,7 +215,7 @@ __curstat_next(WT_CURSOR *cursor)
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_ERR(WT_NOTFOUND);
}
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -254,7 +254,7 @@ __curstat_prev(WT_CURSOR *cursor)
WT_ERR(WT_NOTFOUND);
}
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -308,7 +308,7 @@ __curstat_search(WT_CURSOR *cursor)
if (cst->key < WT_STAT_KEY_MIN(cst) || cst->key > WT_STAT_KEY_MAX(cst))
WT_ERR(WT_NOTFOUND);
- cst->v = cst->stats_first[WT_STAT_KEY_OFFSET(cst)].v;
+ cst->v = (uint64_t)cst->stats[WT_STAT_KEY_OFFSET(cst)];
WT_ERR(__curstat_print_value(session, cst->v, &cst->pv));
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
@@ -354,13 +354,14 @@ __curstat_conn_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
* Optionally clear the connection statistics.
*/
__wt_conn_stat_init(session);
- cst->u.conn_stats = conn->stats;
+ __wt_stat_connection_aggregate(conn->stats, &cst->u.conn_stats);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_connection_stats(&conn->stats);
+ __wt_stat_connection_clear_all(conn->stats);
- cst->stats_first = cst->stats = (WT_STATS *)&cst->u.conn_stats;
+ cst->stats = (int64_t *)&cst->u.conn_stats;
cst->stats_base = WT_CONNECTION_STATS_BASE;
- cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(WT_STATS);
+ cst->stats_count = sizeof(WT_CONNECTION_STATS) / sizeof(int64_t);
+ cst->stats_desc = __wt_stat_connection_desc;
}
/*
@@ -383,7 +384,7 @@ __curstat_file_init(WT_SESSION_IMPL *session,
filename = uri;
if (!WT_PREFIX_SKIP(filename, "file:"))
return (EINVAL);
- __wt_stat_init_dsrc_stats(&cst->u.dsrc_stats);
+ __wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
WT_RET(__wt_block_manager_size(
session, filename, &cst->u.dsrc_stats));
__wt_curstat_dsrc_final(cst);
@@ -398,9 +399,10 @@ __curstat_file_init(WT_SESSION_IMPL *session,
* Optionally clear the data source statistics.
*/
if ((ret = __wt_btree_stat_init(session, cst)) == 0) {
- cst->u.dsrc_stats = dhandle->stats;
+ __wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
+ __wt_stat_dsrc_aggregate(dhandle->stats, &cst->u.dsrc_stats);
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_dsrc_stats(&dhandle->stats);
+ __wt_stat_dsrc_clear_all(dhandle->stats);
__wt_curstat_dsrc_final(cst);
}
@@ -417,10 +419,10 @@ __curstat_file_init(WT_SESSION_IMPL *session,
void
__wt_curstat_dsrc_final(WT_CURSOR_STAT *cst)
{
-
- cst->stats_first = cst->stats = (WT_STATS *)&cst->u.dsrc_stats;
+ cst->stats = (int64_t *)&cst->u.dsrc_stats;
cst->stats_base = WT_DSRC_STATS_BASE;
- cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(WT_STATS);
+ cst->stats_count = sizeof(WT_DSRC_STATS) / sizeof(int64_t);
+ cst->stats_desc = __wt_stat_dsrc_desc;
}
/*
@@ -495,7 +497,7 @@ __wt_curstat_open(WT_SESSION_IMPL *session,
conn = S2C(session);
- WT_ERR(__wt_calloc_one(session, &cst));
+ WT_RET(__wt_calloc_one(session, &cst));
cursor = &cst->iface;
*cursor = iface;
cursor->session = &session->iface;
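[Editor's note: the one-character change from WT_ERR to WT_RET on the cst allocation above matters: WT_ERR jumps to the function's err label, which cleans up state that does not exist yet when the very first allocation fails, while WT_RET simply returns. A minimal sketch of the two macro styles and where each is safe follows, with hypothetical RET/ERR macros modeled on that convention.]

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical macros modeled on the WT_RET/WT_ERR convention. */
	#define RET(a) do {						\
		int __r = (a);						\
		if (__r != 0)						\
			return (__r);					\
	} while (0)
	#define ERR(a) do {						\
		if ((ret = (a)) != 0)					\
			goto err;					\
	} while (0)

	static int
	check_args(const char *name)
	{
		return (name == NULL ? EINVAL : 0);
	}

	static int
	fill(char *buf, const char *name)
	{
		snprintf(buf, 16, "%s", name);
		return (0);
	}

	static int
	open_thing(const char *name, char **resultp)
	{
		int ret = 0;
		char *buf;

		/* Nothing allocated yet: a failure can simply return. */
		RET(check_args(name));

		if ((buf = calloc(1, 16)) == NULL)
			return (ENOMEM);

		/* From here on, a failure must free buf: jump to err. */
		ERR(fill(buf, name));

		*resultp = buf;
		return (0);

	err:	free(buf);
		return (ret);
	}

	int
	main(void)
	{
		char *s;

		if (open_thing("hello", &s) == 0) {
			printf("%s\n", s);
			free(s);
		}
		return (0);
	}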
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 858c6af6853..701bd845ae9 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -258,9 +258,9 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap)
item->data, item->size, "q", &cursor->recno));
} else
cursor->recno = va_arg(ap, uint64_t);
- if (cursor->recno == 0)
+ if (cursor->recno == WT_RECNO_OOB)
WT_ERR_MSG(session, EINVAL,
- "Record numbers must be greater than zero");
+ "%d is an invalid record number", WT_RECNO_OOB);
buf->data = &cursor->recno;
sz = sizeof(cursor->recno);
} else {
@@ -463,16 +463,17 @@ __wt_cursor_close(WT_CURSOR *cursor)
WT_SESSION_IMPL *session;
session = (WT_SESSION_IMPL *)cursor->session;
- __wt_buf_free(session, &cursor->key);
- __wt_buf_free(session, &cursor->value);
if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
TAILQ_REMOVE(&session->cursors, cursor, q);
+ (void)__wt_atomic_sub32(&S2C(session)->open_cursor_count, 1);
WT_STAT_FAST_DATA_DECR(session, session_cursor_open);
- WT_STAT_FAST_CONN_ATOMIC_DECR(session, session_cursor_open);
}
+ __wt_buf_free(session, &cursor->key);
+ __wt_buf_free(session, &cursor->value);
+
__wt_free(session, cursor->internal_uri);
__wt_free(session, cursor->uri);
__wt_overwrite_and_free(session, cursor);
@@ -683,8 +684,8 @@ __wt_cursor_init(WT_CURSOR *cursor,
TAILQ_INSERT_HEAD(&session->cursors, cursor, q);
F_SET(cursor, WT_CURSTD_OPEN);
+ (void)__wt_atomic_add32(&S2C(session)->open_cursor_count, 1);
WT_STAT_FAST_DATA_INCR(session, session_cursor_open);
- WT_STAT_FAST_CONN_ATOMIC_INCR(session, session_cursor_open);
*cursorp = (cdump != NULL) ? cdump : cursor;
return (0);
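[Editor's note: the cursor open/close paths above move the connection-wide open-cursor count from a statistics macro to explicit __wt_atomic_add32/__wt_atomic_sub32 calls, so any session can maintain the shared counter without a lock. A minimal C11 sketch of the same idea follows, with a hypothetical counter and worker.]

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/*
	 * Hypothetical connection-wide cursor count kept with atomics so
	 * any session can update it without taking a lock.
	 */
	static atomic_uint open_cursor_count;

	static void *
	worker(void *arg)
	{
		int i;

		(void)arg;
		for (i = 0; i < 100000; ++i) {
			atomic_fetch_add(&open_cursor_count, 1);	/* open */
			atomic_fetch_sub(&open_cursor_count, 1);	/* close */
		}
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t t[4];
		int i;

		for (i = 0; i < 4; ++i)
			pthread_create(&t[i], NULL, worker, NULL);
		for (i = 0; i < 4; ++i)
			pthread_join(t[i], NULL);

		/* All opens were matched by closes: prints 0. */
		printf("open cursors: %u\n", atomic_load(&open_cursor_count));
		return (0);
	}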
diff --git a/src/docs/cursor-random.dox b/src/docs/cursor-random.dox
index 70a28407ea5..5d0b89d6547 100644
--- a/src/docs/cursor-random.dox
+++ b/src/docs/cursor-random.dox
@@ -2,15 +2,11 @@
The \c next_random configuration to the WT_SESSION::open_cursor method
configures the cursor to return a pseudo-random record from a row-store
-object.
-
-The ability to return a random record was added to support a particular
-application, and as a result has somewhat unusual semantics. First, the
-returned record may not be random at all in the case of objects with only a few
-rows (especially when the object has never been written to the backing store).
-In such objects, the WT_CURSOR::next method for cursors configured with \c
-next_random may return the same row on each call. Additionally, even in larger
-objects, the WT_CURSOR::next method usually returns the first record from a
-random page in the underlying file, not a random record from a random page.
+object (the configuration is not supported on other types of objects).
+The configuration has somewhat unusual semantics: first, the returned
+record may not be very random in the case of objects with only a few
+rows. Additionally, even in larger objects, the WT_CURSOR::next method
+generally returns the first record from a random page in the underlying
+file, not a random record from a random page.
*/
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index d9ac58103c5..e0640660b0a 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -3,6 +3,15 @@
@section version_262 Upgrading to Version 2.6.2
<dl>
+<dt>Change to config_base=false</dt>
+<dd>
+If \c config_base=false is set in the config passed directly to
+::wiredtiger_open, any existing base configuration file will now be ignored.
+If an application was relying on the base configuration file being read
+despite that setting, the connection will be opened with different settings
+after upgrading, which could lead to errors or unexpected behavior.
+</dd>
+
<dt>WT_SESSION.verify</dt>
<dd>
The WT_SESSION.verify method in this release has a new configuration
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index 38cfc07ac5b..66fabe48fb2 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -79,26 +79,19 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
WT_ERR(__wt_evict(session, ref, 1));
break;
case WT_SYNC_DISCARD:
- WT_ASSERT(session,
- __wt_page_can_evict(session, page, 0, NULL));
- __wt_evict_page_clean_update(session, ref, 1);
- break;
- case WT_SYNC_DISCARD_FORCE:
/*
- * Forced discard of the page, whether clean or dirty.
- * If we see a dirty page in a forced discard, clean
- * the page, both to keep statistics correct, and to
- * let the page-discard function assert no dirty page
- * is ever discarded.
+ * Dead handles may reference dirty pages; clean the
+ * page, both to keep statistics correct, and to let
+ * the page-discard function assert no dirty page is
+ * ever discarded.
*/
- if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
- F_SET(session, WT_SESSION_DISCARD_FORCE);
+ WT_ASSERT(session,
+ F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
+ __wt_page_can_evict(session, page, 0, NULL));
__wt_evict_page_clean_update(session, ref, 1);
- F_CLR(session, WT_SESSION_DISCARD_FORCE);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index 6aa61b4137b..ce61aa2c798 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -10,14 +10,13 @@
static int __evict_clear_all_walks(WT_SESSION_IMPL *);
static int __evict_clear_walks(WT_SESSION_IMPL *);
-static int __evict_has_work(WT_SESSION_IMPL *, uint32_t *);
static int WT_CDECL __evict_lru_cmp(const void *, const void *);
static int __evict_lru_pages(WT_SESSION_IMPL *, int);
-static int __evict_lru_walk(WT_SESSION_IMPL *, uint32_t);
+static int __evict_lru_walk(WT_SESSION_IMPL *);
static int __evict_page(WT_SESSION_IMPL *, int);
static int __evict_pass(WT_SESSION_IMPL *);
-static int __evict_walk(WT_SESSION_IMPL *, uint32_t);
-static int __evict_walk_file(WT_SESSION_IMPL *, u_int *, uint32_t);
+static int __evict_walk(WT_SESSION_IMPL *);
+static int __evict_walk_file(WT_SESSION_IMPL *, u_int *);
static WT_THREAD_RET __evict_worker(void *);
static int __evict_server_work(WT_SESSION_IMPL *);
@@ -107,7 +106,7 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_spin_lock(session, &cache->evict_lock);
elem = cache->evict_max;
- for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
if (evict->ref == ref) {
__evict_list_clear(session, evict);
break;
@@ -159,6 +158,7 @@ __evict_server(void *arg)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
+ u_int spins;
session = arg;
conn = S2C(session);
@@ -176,7 +176,27 @@ __evict_server(void *arg)
* otherwise we can block applications evicting large pages.
*/
if (!F_ISSET(cache, WT_CACHE_STUCK)) {
- WT_ERR(__evict_clear_walks(session));
+ for (spins = 0; (ret = __wt_spin_trylock(
+ session, &conn->dhandle_lock)) == EBUSY &&
+ !F_ISSET(cache, WT_CACHE_CLEAR_WALKS);
+ spins++) {
+ if (spins < 1000)
+ __wt_yield();
+ else
+ __wt_sleep(0, 1000);
+ }
+ /*
+ * If we gave up acquiring the lock, that indicates a
+ * session is waiting for us to clear walks. Do that
+ * as part of a normal pass (without the handle list
+ * lock) to avoid deadlock.
+ */
+ if (ret == EBUSY)
+ continue;
+ WT_ERR(ret);
+ ret = __evict_clear_all_walks(session);
+ __wt_spin_unlock(session, &conn->dhandle_lock);
+ WT_ERR(ret);
/* Next time we wake up, reverse the sweep direction. */
cache->flags ^= WT_CACHE_WALK_REVERSE;
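[Editor's note: the eviction server above no longer blocks unconditionally on the handle-list lock. It spins on a trylock, yielding for the first thousand attempts and then sleeping a millisecond at a time, and gives up entirely if a session is waiting for walks to be cleared. A minimal sketch of that bounded spin-then-sleep backoff follows, with a hypothetical lock_with_backoff over a plain pthread mutex.]

	#include <errno.h>
	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	/*
	 * Hypothetical backoff loop around a try-lock: spin (yielding) for
	 * a while, then fall back to short sleeps so a long-held lock
	 * doesn't burn a CPU. The caller can also ask us to give up.
	 */
	static int
	lock_with_backoff(pthread_mutex_t *lock, int (*should_give_up)(void))
	{
		int ret;
		unsigned spins;

		for (spins = 0;
		    (ret = pthread_mutex_trylock(lock)) == EBUSY; ++spins) {
			if (should_give_up())
				return (EBUSY);
			if (spins < 1000)
				sched_yield();
			else
				usleep(1000);	/* 1ms */
		}
		return (ret);
	}

	static int
	never(void)
	{
		return (0);
	}

	int
	main(void)
	{
		pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

		if (lock_with_backoff(&lock, never) == 0) {
			printf("locked\n");
			pthread_mutex_unlock(&lock);
		}
		return (0);
	}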
@@ -227,9 +247,16 @@ __evict_workers_resize(WT_SESSION_IMPL *session)
for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) {
WT_ERR(__wt_open_internal_session(conn,
- "eviction-worker", 0, 0, &workers[i].session));
+ "eviction-worker", 1, 0, &workers[i].session));
workers[i].id = i;
- F_SET(workers[i].session, WT_SESSION_CAN_WAIT);
+
+ /*
+ * Eviction worker threads get their own lookaside table cursor.
+ * Eviction worker threads may be called upon to perform slow
+ * operations for the block manager.
+ */
+ F_SET(workers[i].session,
+ WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT);
if (i < conn->evict_workers_min) {
++conn->evict_workers;
@@ -259,7 +286,7 @@ __wt_evict_create(WT_SESSION_IMPL *session)
/* We need a session handle because we're reading/writing pages. */
WT_RET(__wt_open_internal_session(
- conn, "eviction-server", 0, 0, &conn->evict_session));
+ conn, "eviction-server", 1, 0, &conn->evict_session));
session = conn->evict_session;
/*
@@ -276,6 +303,9 @@ __wt_evict_create(WT_SESSION_IMPL *session)
else
F_SET(session, WT_SESSION_CAN_WAIT);
+ /* The eviction server gets its own lookaside table cursor. */
+ F_SET(session, WT_SESSION_LOOKASIDE_CURSOR);
+
/*
* Start the primary eviction server thread after the worker threads
* have started to avoid it starting additional worker threads before
@@ -385,47 +415,62 @@ err: WT_PANIC_MSG(session, ret, "cache eviction worker error");
}
/*
- * __evict_has_work --
- * Find out if there is eviction work to be done.
+ * __evict_update_work --
+ * Configure eviction work state.
*/
-static int
-__evict_has_work(WT_SESSION_IMPL *session, uint32_t *flagsp)
+static bool
+__evict_update_work(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
- uint32_t flags;
- int evict, dirty;
+ uint64_t bytes_inuse, bytes_max, dirty_inuse;
conn = S2C(session);
cache = conn->cache;
- *flagsp = flags = 0;
+
+ /* Clear previous state. */
+ cache->state = 0;
if (!F_ISSET(conn, WT_CONN_EVICTION_RUN))
- return (0);
+ return (false);
- /* Check to see if the eviction server should run. */
- __wt_cache_status(session, &evict, &dirty);
- if (evict)
- /* The cache is too small. */
- LF_SET(WT_EVICT_PASS_ALL);
- else if (dirty)
- /* Too many dirty pages, ignore clean pages. */
- LF_SET(WT_EVICT_PASS_DIRTY);
- else if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
- /*
- * Evict pages with oldest generation (which would otherwise
- * block application threads) set regardless of whether we have
- * reached the eviction trigger.
- */
- LF_SET(WT_EVICT_PASS_WOULD_BLOCK);
- F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ /*
+ * Page eviction overrides the dirty target and other types of eviction;
+ * that is, we don't care where we are with respect to the dirty target
+ * if page eviction is configured.
+ *
+ * Avoid division by zero if the cache size has not yet been set in a
+ * shared cache.
+ */
+ bytes_max = conn->cache_size + 1;
+ bytes_inuse = __wt_cache_bytes_inuse(cache);
+ if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_ALL);
+ goto done;
}
- if (F_ISSET(cache, WT_CACHE_STUCK))
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ dirty_inuse = __wt_cache_dirty_inuse(cache);
+ if (dirty_inuse > (cache->eviction_dirty_target * bytes_max) / 100) {
+ FLD_SET(cache->state, WT_EVICT_PASS_DIRTY);
+ goto done;
+ }
- *flagsp = flags;
- return (0);
+ /*
+ * Evict pages with oldest generation (which would otherwise block
+ * application threads), set regardless of whether we have reached
+ * the eviction trigger.
+ */
+ if (F_ISSET(cache, WT_CACHE_WOULD_BLOCK)) {
+ FLD_SET(cache->state, WT_EVICT_PASS_WOULD_BLOCK);
+
+ F_CLR(cache, WT_CACHE_WOULD_BLOCK);
+ goto done;
+ }
+ return (false);
+
+done: if (F_ISSET(cache, WT_CACHE_STUCK))
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
+ return (true);
}
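[Editor's note: __evict_update_work above replaces the per-call flags word with cache->state, computed from cache usage: if total bytes in use exceed the eviction target percentage, evict everything; otherwise, if dirty bytes exceed the dirty target, evict only dirty pages. The maximum is padded by one byte to avoid dividing by zero before a shared cache size is set. A minimal sketch of that decision follows, with hypothetical flag values and percentages.]

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Hypothetical eviction pass flags. */
	#define EVICT_PASS_ALL		0x1
	#define EVICT_PASS_DIRTY	0x2

	/*
	 * Decide what kind of eviction pass is needed from cache usage.
	 * Targets are percentages of the maximum cache size; pad the
	 * maximum by one byte to avoid dividing by zero before it is set.
	 */
	static bool
	update_work(uint64_t bytes_max, uint64_t bytes_inuse,
	    uint64_t dirty_inuse, unsigned target_pct, unsigned dirty_pct,
	    uint32_t *statep)
	{
		*statep = 0;
		bytes_max += 1;
		if (bytes_inuse > target_pct * bytes_max / 100) {
			*statep = EVICT_PASS_ALL;
			return (true);
		}
		if (dirty_inuse > dirty_pct * bytes_max / 100) {
			*statep = EVICT_PASS_DIRTY;
			return (true);
		}
		return (false);
	}

	int
	main(void)
	{
		uint32_t state;

		/* 100 bytes of cache, 85 in use, 80% target: evict all. */
		if (update_work(100, 85, 10, 80, 5, &state))
			printf("state 0x%x\n", (unsigned)state);
		return (0);
	}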
/*
@@ -439,7 +484,6 @@ __evict_pass(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_EVICT_WORKER *worker;
uint64_t pages_evicted;
- uint32_t flags;
int loop;
conn = S2C(session);
@@ -462,25 +506,36 @@ __evict_pass(WT_SESSION_IMPL *session)
}
/*
- * Increment the shared read generation. We do this
- * occasionally even if eviction is not currently required, so
- * that pages have some relative read generation when the
- * eviction server does need to do some work.
+ * Increment the shared read generation. Do this occasionally
+ * even if eviction is not currently required, so that pages
+ * have some relative read generation when the eviction server
+ * does need to do some work.
*/
__wt_cache_read_gen_incr(session);
- WT_RET(__evict_has_work(session, &flags));
- if (flags == 0)
+ /*
+ * Update the oldest ID: we use it to decide whether pages are
+ * candidates for eviction. Without this, if all threads are
+ * blocked after a long-running transaction (such as a
+ * checkpoint) completes, we may never start evicting again.
+ *
+ * Do this every time the eviction server wakes up, regardless
+ * of whether the cache is full, to prevent the oldest ID
+ * falling too far behind.
+ */
+ __wt_txn_update_oldest(session, 1);
+
+ if (!__evict_update_work(session))
break;
if (loop > 10)
- LF_SET(WT_EVICT_PASS_AGGRESSIVE);
+ FLD_SET(cache->state, WT_EVICT_PASS_AGGRESSIVE);
/*
* Start a worker if we have capacity and we haven't reached
* the eviction targets.
*/
- if (LF_ISSET(WT_EVICT_PASS_ALL |
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_ALL |
WT_EVICT_PASS_DIRTY | WT_EVICT_PASS_WOULD_BLOCK) &&
conn->evict_workers < conn->evict_workers_max) {
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
@@ -499,7 +554,7 @@ __evict_pass(WT_SESSION_IMPL *session)
" In use: %" PRIu64 " Dirty: %" PRIu64,
conn->cache_size, cache->bytes_inmem, cache->bytes_dirty));
- WT_RET(__evict_lru_walk(session, flags));
+ WT_RET(__evict_lru_walk(session));
WT_RET(__evict_server_work(session));
/*
@@ -520,7 +575,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* Mark the cache as stuck if we need space
* and aren't evicting any pages.
*/
- if (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK)) {
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_WOULD_BLOCK)) {
F_SET(cache, WT_CACHE_STUCK);
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_slow);
@@ -546,9 +602,14 @@ static int
__evict_clear_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
+ WT_CACHE *cache;
WT_REF *ref;
btree = S2BT(session);
+ cache = S2C(session)->cache;
+
+ if (session->dhandle == cache->evict_file_next)
+ cache->evict_file_next = NULL;
if ((ref = btree->evict_ref) == NULL)
return (0);
@@ -568,21 +629,17 @@ __evict_clear_walk(WT_SESSION_IMPL *session)
static int
__evict_clear_walks(WT_SESSION_IMPL *session)
{
- WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *s;
u_int i, session_cnt;
conn = S2C(session);
- cache = conn->cache;
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (s = conn->sessions, i = 0; i < session_cnt; ++s, ++i) {
if (!s->active || !F_ISSET(s, WT_SESSION_CLEAR_EVICT_WALK))
continue;
- if (s->dhandle == cache->evict_file_next)
- cache->evict_file_next = NULL;
WT_WITH_DHANDLE(
session, s->dhandle, WT_TRET(__evict_clear_walk(session)));
}
@@ -606,7 +663,8 @@ __evict_request_walk_clear(WT_SESSION_IMPL *session)
F_SET(session, WT_SESSION_CLEAR_EVICT_WALK);
- while (btree->evict_ref != NULL && ret == 0) {
+ while (ret == 0 && (btree->evict_ref != NULL ||
+ cache->evict_file_next == session->dhandle)) {
F_SET(cache, WT_CACHE_CLEAR_WALKS);
ret = __wt_cond_wait(
session, cache->evict_waiter_cond, 100000);
@@ -630,7 +688,7 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
conn = S2C(session);
- SLIST_FOREACH(dhandle, &conn->dhlh, l)
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q)
if (WT_PREFIX_MATCH(dhandle->name, "file:"))
WT_WITH_DHANDLE(session,
dhandle, WT_TRET(__evict_clear_walk(session)));
@@ -638,44 +696,6 @@ __evict_clear_all_walks(WT_SESSION_IMPL *session)
}
/*
- * __wt_evict_page --
- * Evict a given page.
- */
-int
-__wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref)
-{
- WT_DECL_RET;
- WT_TXN *txn;
- WT_TXN_ISOLATION saved_iso;
-
- /*
- * We have to take care when evicting pages not to write a change that:
- * (a) is not yet committed; or
- * (b) is committed more recently than an in-progress checkpoint.
- *
- * We handle both of these cases by setting up the transaction context
- * before evicting, using a special "eviction" isolation level, where
- * only globally visible updates can be evicted.
- */
- __wt_txn_update_oldest(session, 1);
- txn = &session->txn;
- saved_iso = txn->isolation;
- txn->isolation = WT_ISO_EVICTION;
-
- /*
- * Sanity check: if a transaction has updates, its updates should not
- * be visible to eviction.
- */
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_HAS_ID) ||
- !__wt_txn_visible(session, txn->id));
-
- ret = __wt_evict(session, ref, 0);
- txn->isolation = saved_iso;
-
- return (ret);
-}
-
-/*
* __wt_evict_file_exclusive_on --
* Get exclusive eviction access to a file and discard any of the file's
* blocks queued for eviction.
@@ -719,7 +739,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp)
* clear it.
*/
elem = cache->evict_max;
- for (i = 0, evict = cache->evict; i < elem; i++, evict++)
+ for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
if (evict->btree == btree)
__evict_list_clear(session, evict);
__wt_spin_unlock(session, &cache->evict_lock);
@@ -773,7 +793,7 @@ __evict_lru_pages(WT_SESSION_IMPL *session, int is_server)
* Add pages to the LRU queue to be evicted from cache.
*/
static int
-__evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_lru_walk(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_DECL_RET;
@@ -784,17 +804,17 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
cache = S2C(session)->cache;
/* Get some more pages to consider for eviction. */
- if ((ret = __evict_walk(session, flags)) != 0)
+ if ((ret = __evict_walk(session)) != 0)
return (ret == EBUSY ? 0 : ret);
/* Sort the list into LRU order and restart. */
__wt_spin_lock(session, &cache->evict_lock);
entries = cache->evict_entries;
- qsort(cache->evict,
+ qsort(cache->evict_queue,
entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
- while (entries > 0 && cache->evict[entries - 1].ref == NULL)
+ while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL)
--entries;
cache->evict_entries = entries;
@@ -811,12 +831,13 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
return (0);
}
- WT_ASSERT(session, cache->evict[0].ref != NULL);
+ WT_ASSERT(session, cache->evict_queue[0].ref != NULL);
/* Track the oldest read generation we have in the queue. */
- cache->read_gen_oldest = cache->evict[0].ref->page->read_gen;
+ cache->read_gen_oldest = cache->evict_queue[0].ref->page->read_gen;
- if (LF_ISSET(WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
+ if (FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK))
/*
* Take all candidates if we only gathered pages with an oldest
* read generation set.
@@ -824,8 +845,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
cache->evict_candidates = entries;
else {
/* Find the bottom 25% of read generations. */
- cutoff = (3 * __evict_read_gen(&cache->evict[0]) +
- __evict_read_gen(&cache->evict[entries - 1])) / 4;
+ cutoff = (3 * __evict_read_gen(&cache->evict_queue[0]) +
+ __evict_read_gen(&cache->evict_queue[entries - 1])) / 4;
/*
* Don't take less than 10% or more than 50% of entries,
* regardless. That said, if there is only one entry, which is
@@ -835,21 +856,21 @@ __evict_lru_walk(WT_SESSION_IMPL *session, uint32_t flags)
candidates < entries / 2;
candidates++)
if (__evict_read_gen(
- &cache->evict[candidates]) > cutoff)
+ &cache->evict_queue[candidates]) > cutoff)
break;
cache->evict_candidates = candidates;
}
/* If we have more than the minimum number of entries, clear them. */
if (cache->evict_entries > WT_EVICT_WALK_BASE) {
- for (i = WT_EVICT_WALK_BASE, evict = cache->evict + i;
+ for (i = WT_EVICT_WALK_BASE, evict = cache->evict_queue + i;
i < cache->evict_entries;
i++, evict++)
__evict_list_clear(session, evict);
cache->evict_entries = WT_EVICT_WALK_BASE;
}
- cache->evict_current = cache->evict;
+ cache->evict_current = cache->evict_queue;
__wt_spin_unlock(session, &cache->evict_lock);
/*
@@ -894,7 +915,7 @@ __evict_server_work(WT_SESSION_IMPL *session)
* Fill in the array by walking the next set of pages.
*/
static int
-__evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
+__evict_walk(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_CACHE *cache;
@@ -910,14 +931,6 @@ __evict_walk(WT_SESSION_IMPL *session, uint32_t flags)
incr = dhandle_locked = 0;
retries = 0;
- /*
- * Update the oldest ID: we use it to decide whether pages are
- * candidates for eviction. Without this, if all threads are blocked
- * after a long-running transaction (such as a checkpoint) completes,
- * we may never start evicting again.
- */
- __wt_txn_update_oldest(session, 1);
-
if (cache->evict_current == NULL)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
else
@@ -957,15 +970,24 @@ retry: while (slot < max_entries && ret == 0) {
dhandle_locked = 1;
}
- if (dhandle == NULL)
- dhandle = SLIST_FIRST(&conn->dhlh);
- else {
+ if (dhandle == NULL) {
+ /*
+ * On entry, continue from wherever we got to in the
+ * scan last time through. If we don't have a saved
+ * handle, start from the beginning of the list.
+ */
+ if ((dhandle = cache->evict_file_next) != NULL)
+ cache->evict_file_next = NULL;
+ else
+ dhandle = TAILQ_FIRST(&conn->dhqh);
+ } else {
if (incr) {
WT_ASSERT(session, dhandle->session_inuse > 0);
- (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_subi32(
+ &dhandle->session_inuse, 1);
incr = 0;
}
- dhandle = SLIST_NEXT(dhandle, l);
+ dhandle = TAILQ_NEXT(dhandle, q);
}
/* If we reach the end of the list, we're done. */
@@ -977,15 +999,6 @@ retry: while (slot < max_entries && ret == 0) {
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- /*
- * Each time we reenter this function, start at the next handle
- * on the list.
- */
- if (cache->evict_file_next != NULL &&
- cache->evict_file_next != dhandle)
- continue;
- cache->evict_file_next = NULL;
-
/* Skip files that don't allow eviction. */
btree = dhandle->handle;
if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
@@ -996,7 +1009,7 @@ retry: while (slot < max_entries && ret == 0) {
* stick in cache until we get aggressive.
*/
if ((btree->checkpointing || btree->evict_priority != 0) &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/* Skip files if we have used all available hazard pointers. */
@@ -1015,7 +1028,7 @@ retry: while (slot < max_entries && ret == 0) {
btree->evict_walk_skips = 0;
prev_slot = slot;
- (void)WT_ATOMIC_ADD4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_addi32(&dhandle->session_inuse, 1);
incr = 1;
__wt_spin_unlock(session, &conn->dhandle_lock);
dhandle_locked = 0;
@@ -1028,7 +1041,7 @@ retry: while (slot < max_entries && ret == 0) {
*/
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_file(session, &slot, flags));
+ ret = __evict_walk_file(session, &slot));
WT_ASSERT(session, session->split_gen == 0);
}
@@ -1046,8 +1059,11 @@ retry: while (slot < max_entries && ret == 0) {
}
if (incr) {
+ /* Remember the file we should visit first, next loop. */
+ cache->evict_file_next = dhandle;
+
WT_ASSERT(session, dhandle->session_inuse > 0);
- (void)WT_ATOMIC_SUB4(dhandle->session_inuse, 1);
+ (void)__wt_atomic_subi32(&dhandle->session_inuse, 1);
incr = 0;
}
@@ -1059,21 +1075,18 @@ retry: while (slot < max_entries && ret == 0) {
/*
* Walk the list of files a few times if we don't find enough pages.
* Try two passes through all the files, give up when we have some
- * candidates and we aren't finding more. Take care not to skip files
- * on subsequent passes.
+ * candidates and we aren't finding more.
*/
if (!F_ISSET(cache, WT_CACHE_CLEAR_WALKS) && ret == 0 &&
slot < max_entries && (retries < 2 ||
- (!LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) && retries < 10 &&
+ (retries < 10 &&
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
(slot == cache->evict_entries || slot > start_slot)))) {
- cache->evict_file_next = NULL;
start_slot = slot;
++retries;
goto retry;
}
- /* Remember the file we should visit first, next loop. */
- cache->evict_file_next = dhandle;
cache->evict_entries = slot;
return (ret);
}
@@ -1092,7 +1105,7 @@ __evict_init_candidate(
cache = S2C(session)->cache;
/* Keep track of the maximum slot we are using. */
- slot = (u_int)(evict - cache->evict);
+ slot = (u_int)(evict - cache->evict_queue);
if (slot >= cache->evict_max)
cache->evict_max = slot + 1;
@@ -1110,10 +1123,11 @@ __evict_init_candidate(
* Get a few page eviction candidates from a single underlying file.
*/
static int
-__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
+__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
{
WT_BTREE *btree;
WT_CACHE *cache;
+ WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
WT_PAGE *page;
@@ -1123,11 +1137,12 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
uint32_t walk_flags;
int enough, internal_pages, modified, restarts;
+ conn = S2C(session);
btree = S2BT(session);
- cache = S2C(session)->cache;
- start = cache->evict + *slotp;
+ cache = conn->cache;
+ start = cache->evict_queue + *slotp;
end = WT_MIN(start + WT_EVICT_WALK_PER_FILE,
- cache->evict + cache->evict_slots);
+ cache->evict_queue + cache->evict_slots);
enough = internal_pages = restarts = 0;
walk_flags = WT_READ_CACHE | WT_READ_NO_EVICT |
@@ -1178,21 +1193,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
goto fast;
/* Optionally ignore clean pages. */
- if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ if (!modified && FLD_ISSET(cache->state, WT_EVICT_PASS_DIRTY))
continue;
/*
* If we are only trickling out pages marked for definite
* eviction, skip anything that isn't marked.
*/
- if (LF_ISSET(WT_EVICT_PASS_WOULD_BLOCK) &&
+ if (FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
page->read_gen != WT_READGEN_OLDEST)
continue;
/* Limit internal pages to 50% unless we get aggressive. */
if (WT_PAGE_IS_INTERNAL(page) &&
++internal_pages > WT_EVICT_WALK_PER_FILE / 2 &&
- !LF_ISSET(WT_EVICT_PASS_AGGRESSIVE))
+ !FLD_ISSET(cache->state, WT_EVICT_PASS_AGGRESSIVE))
continue;
/*
@@ -1207,36 +1222,44 @@ fast: /* If the page can't be evicted, give up. */
continue;
/*
- * If the page is clean but has modifications that appear too
- * new to evict, skip it.
+ * Additional tests of whether eviction is likely to succeed.
*
- * Note: take care with ordering: if we detected that the page
- * is modified above, we expect mod != NULL.
+ * If eviction is stuck or we are helping with forced eviction,
+ * try anyway: maybe a transaction that was running last time
+ * we wrote the page has since rolled back, or we can help the
+ * checkpoint complete sooner. Additionally, being stuck will
+ * configure lookaside table writes in reconciliation, allowing
+ * us to evict pages we can't usually evict.
*/
- mod = page->modify;
- if (!modified && mod != NULL && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- !__wt_txn_visible_all(session, mod->rec_max_txn))
- continue;
+ if (!FLD_ISSET(cache->state,
+ WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK)) {
+ /*
+ * Note: take care with ordering: if we detected that
+ * the page is modified above, we expect mod != NULL.
+ */
+ mod = page->modify;
- /*
- * If the oldest transaction hasn't changed since the last time
- * this page was written, it's unlikely that we can make
- * progress. Similarly, if the most recent update on the page
- * is not yet globally visible, eviction will fail. These
- * heuristics attempt to avoid repeated attempts to evict the
- * same page.
- *
- * That said, if eviction is stuck, or we are helping with
- * forced eviction, try anyway: maybe a transaction that was
- * running last time we wrote the page has since rolled back,
- * or we can help get the checkpoint completed sooner.
- */
- if (modified && !LF_ISSET(
- WT_EVICT_PASS_AGGRESSIVE | WT_EVICT_PASS_WOULD_BLOCK) &&
- (mod->disk_snap_min == S2C(session)->txn_global.oldest_id ||
- !__wt_txn_visible_all(session, mod->update_txn)))
- continue;
+ /*
+ * If the page is clean but has modifications that
+ * appear too new to evict, skip it.
+ */
+ if (!modified && mod != NULL &&
+ !__wt_txn_visible_all(session, mod->rec_max_txn))
+ continue;
+
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely we
+ * can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ */
+ if (modified &&
+ (mod->disk_snap_min == conn->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session, mod->update_txn)))
+ continue;
+ }
WT_ASSERT(session, evict->ref == NULL);
__evict_init_candidate(session, evict, ref);
@@ -1245,28 +1268,28 @@ fast: /* If the page can't be evicted, give up. */
WT_RET(__wt_verbose(session, WT_VERB_EVICTSERVER,
"select: %p, size %" PRIu64, page, page->memory_footprint));
}
+ WT_RET_NOTFOUND_OK(ret);
+
+ *slotp += (u_int)(evict - start);
/*
* If we happen to end up on the root page, clear it. We have to track
* hazard pointers, and the root page complicates that calculation.
*
- * Also clear the walk if we land on a page requiring forced eviction.
- * The eviction server may go to sleep, and we want this page evicted
- * as quickly as possible.
+ * If we land on a page requiring forced eviction, move on to the next
+ * page: we want this page evicted as quickly as possible.
*/
- if ((ref = btree->evict_ref) != NULL && (__wt_ref_is_root(ref) ||
- ref->page->read_gen == WT_READGEN_OLDEST)) {
- btree->evict_ref = NULL;
- __wt_page_release(session, ref, WT_READ_NO_EVICT);
+ if ((ref = btree->evict_ref) != NULL) {
+ if (__wt_ref_is_root(ref))
+ WT_RET(__evict_clear_walk(session));
+ else if (ref->page->read_gen == WT_READGEN_OLDEST)
+ WT_RET_NOTFOUND_OK(__wt_tree_walk(session,
+ &btree->evict_ref, &pages_walked, walk_flags));
}
- /* If the walk was interrupted by a locked page, that's okay. */
- if (ret == WT_NOTFOUND)
- ret = 0;
-
- *slotp += (u_int)(evict - start);
WT_STAT_FAST_CONN_INCRV(session, cache_eviction_walk, pages_walked);
- return (ret);
+
+ return (0);
}
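
The two skip heuristics above (unchanged oldest transaction ID, not-yet-visible newest update) can be summarized in a hedged sketch; the types and the visible-to-all rule below are simplified stand-ins for WiredTiger's transaction code:

#include <stdbool.h>
#include <stdint.h>

struct txn_state {
	uint64_t oldest_id;	/* oldest transaction still running */
};

struct page_modify {
	uint64_t disk_snap_min;	/* oldest ID when the page was last written */
	uint64_t update_txn;	/* newest update on the page */
};

/* Simplified: globally visible means older than every running transaction. */
static bool
txn_visible_all(const struct txn_state *txn, uint64_t id)
{
	return (id < txn->oldest_id);
}

static bool
worth_evicting(const struct txn_state *txn, const struct page_modify *mod)
{
	if (mod->disk_snap_min == txn->oldest_id)
		return (false);	/* no progress since the last write */
	if (!txn_visible_all(txn, mod->update_txn))
		return (false);	/* newest update still pins the page */
	return (true);
}

int
main(void)
{
	struct txn_state txn = { .oldest_id = 100 };
	struct page_modify mod = { .disk_snap_min = 90, .update_txn = 80 };

	return (worth_evicting(&txn, &mod) ? 0 : 1);	/* evictable here */
}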
/*
@@ -1310,7 +1333,7 @@ __evict_get_ref(
/* Get the next page queued for eviction. */
while ((evict = cache->evict_current) != NULL &&
- evict < cache->evict + candidates && evict->ref != NULL) {
+ evict < cache->evict_queue + candidates && evict->ref != NULL) {
WT_ASSERT(session, evict->btree != NULL);
/* Move to the next item. */
@@ -1321,8 +1344,8 @@ __evict_get_ref(
* multiple attempts to evict it. For pages that are already
* being evicted, this operation will fail and we will move on.
*/
- if (!WT_ATOMIC_CAS4(
- evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
+ if (!__wt_atomic_casv32(
+ &evict->ref->state, WT_REF_MEM, WT_REF_LOCKED)) {
__evict_list_clear(session, evict);
continue;
}
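
The queue-claiming step is a single compare-and-swap on the page state. A self-contained sketch of the same idea using C11 atomics instead of the __wt_atomic_casv32 wrapper; the numeric values match the WT_REF_* defines introduced later in this patch:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define REF_LOCKED	2u	/* page locked for exclusive access */
#define REF_MEM		3u	/* page is in cache and valid */

int
main(void)
{
	_Atomic uint32_t state = REF_MEM;
	uint32_t expected = REF_MEM;

	/* Succeeds only if the page is still in cache and unlocked. */
	if (atomic_compare_exchange_strong(&state, &expected, REF_LOCKED))
		printf("locked for eviction\n");
	else
		printf("state is %u, skip this candidate\n", expected);
	return (0);
}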
@@ -1331,7 +1354,7 @@ __evict_get_ref(
* Increment the busy count in the btree handle to prevent it
* from being closed under us.
*/
- (void)WT_ATOMIC_ADD4(evict->btree->evict_busy, 1);
+ (void)__wt_atomic_addv32(&evict->btree->evict_busy, 1);
*btreep = evict->btree;
*refp = evict->ref;
@@ -1345,7 +1368,7 @@ __evict_get_ref(
}
/* Clear the current pointer if there are no more candidates. */
- if (evict >= cache->evict + cache->evict_candidates)
+ if (evict >= cache->evict_queue + cache->evict_candidates)
cache->evict_current = NULL;
__wt_spin_unlock(session, &cache->evict_lock);
@@ -1402,15 +1425,12 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* page-discard function assert that no dirty pages are ever
* discarded.
*/
- if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD) &&
- __wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
- __wt_cache_dirty_decr(session, page);
- }
+ if (F_ISSET(btree->dhandle, WT_DHANDLE_DEAD))
+ __wt_page_modify_clear(session, page);
- WT_WITH_BTREE(session, btree, ret = __wt_evict_page(session, ref));
+ WT_WITH_BTREE(session, btree, ret = __wt_evict(session, ref, 0));
- (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+ (void)__wt_atomic_subv32(&btree->evict_busy, 1);
WT_RET(ret);
@@ -1427,7 +1447,7 @@ __evict_page(WT_SESSION_IMPL *session, int is_server)
* crosses its boundaries.
*/
int
-__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
+__wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full)
{
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
@@ -1544,29 +1564,31 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full)
 * NOTE: this function is not called anywhere; it is intended to be called
* from a debugger.
*/
-void
-__wt_cache_dump(WT_SESSION_IMPL *session)
+int
+__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
{
- WT_BTREE *btree;
+ FILE *fp;
WT_CONNECTION_IMPL *conn;
- WT_DATA_HANDLE *dhandle;
- WT_REF *next_walk;
+ WT_DATA_HANDLE *dhandle, *saved_dhandle;
WT_PAGE *page;
+ WT_REF *next_walk;
uint64_t file_intl_pages, file_leaf_pages;
uint64_t file_bytes, file_dirty, total_bytes;
conn = S2C(session);
total_bytes = 0;
- SLIST_FOREACH(dhandle, &conn->dhlh, l) {
+ if (ofile == NULL)
+ fp = stdout;
+ else
+ WT_RET(__wt_fopen(session, ofile, WT_FHANDLE_WRITE, 0, &fp));
+
+ saved_dhandle = session->dhandle;
+ TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
if (!WT_PREFIX_MATCH(dhandle->name, "file:") ||
!F_ISSET(dhandle, WT_DHANDLE_OPEN))
continue;
- btree = dhandle->handle;
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
- continue;
-
file_bytes = file_dirty = file_intl_pages = file_leaf_pages = 0;
next_walk = NULL;
session->dhandle = dhandle;
@@ -1581,12 +1603,14 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
file_bytes += page->memory_footprint;
if (__wt_page_is_modified(page))
file_dirty += page->memory_footprint;
+ (void)__wt_fprintf(fp,
+ "%" WT_SIZET_FMT ", ", page->memory_footprint);
}
session->dhandle = NULL;
- printf("cache dump: %s%s%s%s:"
- " %" PRIu64 " intl pages, %" PRIu64 " leaf pages,"
- " %" PRIu64 "MB, %" PRIu64 "MB dirty\n",
+ (void)__wt_fprintf(fp, "\n" "cache dump: %s%s%s%s\n\t"
+ " %" PRIu64 " internal pages, %" PRIu64 " leaf pages,"
+ " %" PRIu64 "MB, %" PRIu64 "MB dirty\n==============\n",
dhandle->name,
dhandle->checkpoint == NULL ? "" : " [",
dhandle->checkpoint == NULL ? "" : dhandle->checkpoint,
@@ -1596,9 +1620,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session)
total_bytes += file_bytes;
}
- printf("cache dump: total found = %" PRIu64 "MB"
+ session->dhandle = saved_dhandle;
+
+ (void)__wt_fprintf(fp, "cache dump: total found = %" PRIu64 "MB"
" vs tracked inuse %" PRIu64 "MB\n",
total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
- fflush(stdout);
+ if (fp != stdout)
+ WT_RET(__wt_fclose(&fp, WT_FHANDLE_WRITE));
+ return (0);
}
#endif
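
The dump's stdout-or-file plumbing follows a standard stdio pattern; a sketch using plain fopen/fclose in place of the __wt_fopen/__wt_fclose wrappers:

#include <stdio.h>

static int
dump(const char *ofile)
{
	FILE *fp;

	if (ofile == NULL)
		fp = stdout;
	else if ((fp = fopen(ofile, "w")) == NULL)
		return (-1);

	fprintf(fp, "cache dump: ...\n");

	/* Only close the stream we opened ourselves. */
	if (fp != stdout && fclose(fp) != 0)
		return (-1);
	return (0);
}

int
main(void)
{
	return (dump(NULL));	/* NULL means stdout, as in the patch */
}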
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 1e5faf45de2..11284ce7b21 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -150,17 +150,12 @@ done: if (((inmem_split && ret == 0) || (forced_eviction && ret == EBUSY)) &&
int
__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
- int evict;
-
/*
* If doing normal system eviction, but only in the service of reducing
* the number of dirty pages, leave the clean page in cache.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/*
* Discard the page and update the reference structure; if the page has
@@ -184,7 +179,6 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
WT_ADDR *addr;
WT_PAGE *parent;
WT_PAGE_MODIFY *mod;
- int evict;
parent = ref->home;
mod = ref->page->modify;
@@ -229,11 +223,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* push it out of cache (and read it back in, when needed), we
* would rather have more, smaller pages than fewer large pages.
*/
- if (!closing) {
- __wt_cache_status(session, &evict, NULL);
- if (!evict)
- return (EBUSY);
- }
+ if (!closing && __wt_eviction_dirty_target(session))
+ return (EBUSY);
/* Discard the parent's address. */
if (ref->addr != NULL && __wt_off_page(parent, ref->addr)) {
@@ -309,8 +300,7 @@ __evict_review(
{
WT_DECL_RET;
WT_PAGE *page;
- WT_PAGE_MODIFY *mod;
- uint32_t reconcile_flags;
+ uint32_t flags;
/*
* Get exclusive access to the page if our caller doesn't have the tree
@@ -331,7 +321,6 @@ __evict_review(
/* Now that we have exclusive access, review the page. */
page = ref->page;
- mod = page->modify;
/*
	 * Fail if an internal page has active children; the children must be evicted
@@ -347,6 +336,13 @@ __evict_review(
/* Check if the page can be evicted. */
if (!closing) {
+ /*
+ * Update the oldest ID to avoid wasted effort should it have
+ * fallen behind current.
+ */
+ if (__wt_page_is_modified(page))
+ __wt_txn_update_oldest(session, 1);
+
if (!__wt_page_can_evict(session, page, 0, inmem_splitp))
return (EBUSY);
@@ -361,9 +357,12 @@ __evict_review(
return (__wt_split_insert(session, ref));
}
+ /* If the page is clean, we're done and we can evict. */
+ if (!__wt_page_is_modified(page))
+ return (0);
+
/*
- * If the page is dirty and can possibly change state, reconcile it to
- * determine the final state.
+ * If the page is dirty, reconcile it to decide if we can evict it.
*
* If we have an exclusive lock (we're discarding the tree), assert
* there are no updates we cannot read.
@@ -377,30 +376,38 @@ __evict_review(
* in-memory pages, (restoring the updates that stopped us from writing
* the block), and inserting the whole mess into the page's parent.
*
- * Don't set the update-restore flag for internal pages, they don't have
- * updates that can be saved and restored.
+ * Otherwise, if eviction is getting pressed, configure reconciliation
+ * to write not-yet-globally-visible updates to the lookaside table,
+ * allowing the eviction of pages we'd otherwise have to retain in cache
+ * to support older readers.
+ *
+ * Don't set the update-restore or lookaside table flags for internal
+ * pages, they don't have update lists that can be saved and restored.
*/
- reconcile_flags = WT_EVICTING;
- if (__wt_page_is_modified(page)) {
- if (closing)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_ERR);
- else if (!WT_PAGE_IS_INTERNAL(page) &&
- page->read_gen == WT_READGEN_OLDEST)
- FLD_SET(reconcile_flags, WT_SKIP_UPDATE_RESTORE);
- WT_RET(__wt_reconcile(session, ref, NULL, reconcile_flags));
- WT_ASSERT(session,
- !__wt_page_is_modified(page) ||
- FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE));
+ flags = WT_EVICTING;
+ if (closing)
+ LF_SET(WT_VISIBILITY_ERR);
+ else if (!WT_PAGE_IS_INTERNAL(page)) {
+ if (page->read_gen == WT_READGEN_OLDEST)
+ LF_SET(WT_EVICT_UPDATE_RESTORE);
+ else if (__wt_eviction_aggressive(session))
+ LF_SET(WT_EVICT_LOOKASIDE);
}
+ WT_RET(__wt_reconcile(session, ref, NULL, flags));
+
/*
- * If the page was ever modified, make sure all of the updates
- * on the page are old enough they can be discarded from cache.
+ * Success: assert the page is clean or reconciliation was configured
+ * for an update/restore split, and if the page is clean, reconciliation
+ * was configured for a lookaside table or all updates on the page are
+ * globally visible.
*/
- if (!closing && mod != NULL &&
- !__wt_txn_visible_all(session, mod->rec_max_txn) &&
- !FLD_ISSET(reconcile_flags, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ WT_ASSERT(session,
+ LF_ISSET(WT_EVICT_UPDATE_RESTORE) || !__wt_page_is_modified(page));
+ WT_ASSERT(session,
+	    LF_ISSET(WT_EVICT_LOOKASIDE) ||
+ __wt_page_is_modified(page) ||
+ __wt_txn_visible_all(session, page->modify->rec_max_txn));
return (0);
}
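
The rewritten flag selection is easiest to see in isolation. A condensed sketch with invented flag values, mirroring the branch structure above: closing trees demand full visibility, and only leaf pages may get update/restore or lookaside treatment:

#include <stdbool.h>
#include <stdint.h>

#define EVICTING	0x01u
#define VISIBILITY_ERR	0x02u
#define UPDATE_RESTORE	0x04u
#define LOOKASIDE	0x08u

static uint32_t
choose_flags(bool closing, bool internal, bool oldest, bool aggressive)
{
	uint32_t flags = EVICTING;

	if (closing)
		flags |= VISIBILITY_ERR;
	else if (!internal) {
		if (oldest)
			flags |= UPDATE_RESTORE;
		else if (aggressive)
			flags |= LOOKASIDE;
	}
	return (flags);
}

int
main(void)
{
	/* Aggressive eviction of a leaf page enables lookaside writes. */
	return (choose_flags(false, false, false, true) ==
	    (EVICTING | LOOKASIDE) ? 0 : 1);
}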
diff --git a/src/include/async.h b/src/include/async.h
index 88ecad6eb2c..fb9a64e774d 100644
--- a/src/include/async.h
+++ b/src/include/async.h
@@ -6,20 +6,6 @@
* See the file LICENSE for redistribution information.
*/
-typedef enum {
- WT_ASYNCOP_ENQUEUED, /* Placed on the work queue */
- WT_ASYNCOP_FREE, /* Able to be allocated to user */
- WT_ASYNCOP_READY, /* Allocated and ready for user to use */
- WT_ASYNCOP_WORKING /* Operation in progress by worker */
-} WT_ASYNC_STATE;
-
-typedef enum {
- WT_ASYNC_FLUSH_NONE=0, /* No flush in progress */
- WT_ASYNC_FLUSH_COMPLETE, /* Notify flush caller it's done */
- WT_ASYNC_FLUSH_IN_PROGRESS, /* Prevent other callers */
- WT_ASYNC_FLUSHING /* Notify workers */
-} WT_ASYNC_FLUSH_STATE;
-
#define MAX_ASYNC_SLEEP_USECS 100000 /* Maximum sleep waiting for work */
#define MAX_ASYNC_YIELD 200 /* Maximum number of yields for work */
@@ -31,7 +17,7 @@ typedef enum {
* The URI/config/format cache.
*/
struct __wt_async_format {
- STAILQ_ENTRY(__wt_async_format) q;
+ TAILQ_ENTRY(__wt_async_format) q;
const char *config;
uint64_t cfg_hash; /* Config hash */
const char *uri;
@@ -53,7 +39,13 @@ struct __wt_async_op_impl {
uint64_t unique_id; /* Unique identifier. */
WT_ASYNC_FORMAT *format; /* Format structure */
- WT_ASYNC_STATE state; /* Op state */
+
+#define WT_ASYNCOP_ENQUEUED 0 /* Placed on the work queue */
+#define WT_ASYNCOP_FREE 1 /* Able to be allocated to user */
+#define WT_ASYNCOP_READY 2 /* Allocated, ready for user to use */
+#define WT_ASYNCOP_WORKING 3 /* Operation in progress by worker */
+ uint32_t state;
+
WT_ASYNC_OPTYPE optype; /* Operation type */
};
@@ -88,10 +80,16 @@ struct __wt_async {
uint64_t alloc_tail; /* Next slot to dequeue */
uint64_t tail_slot; /* Worker slot consumed */
- STAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
- int cur_queue; /* Currently enqueued */
- int max_queue; /* Maximum enqueued */
- WT_ASYNC_FLUSH_STATE flush_state; /* Queue flush state */
+ TAILQ_HEAD(__wt_async_format_qh, __wt_async_format) formatqh;
+ uint32_t cur_queue; /* Currently enqueued */
+ uint32_t max_queue; /* Maximum enqueued */
+
+#define WT_ASYNC_FLUSH_NONE 0 /* No flush in progress */
+#define WT_ASYNC_FLUSH_COMPLETE 1 /* Notify flush caller done */
+#define WT_ASYNC_FLUSH_IN_PROGRESS 2 /* Prevent other callers */
+#define WT_ASYNC_FLUSHING 3 /* Notify workers */
+ uint32_t flush_state;
+
/* Notify any waiting threads when flushing is done. */
WT_CONDVAR *flush_cond;
WT_ASYNC_OP_IMPL flush_op; /* Special flush op */
@@ -112,7 +110,7 @@ struct __wt_async {
* has a cache of async cursors to reuse for operations.
*/
struct __wt_async_cursor {
- STAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
+ TAILQ_ENTRY(__wt_async_cursor) q; /* Worker cache */
uint64_t cfg_hash; /* Config hash */
uint64_t uri_hash; /* URI hash */
WT_CURSOR *c; /* WT cursor */
@@ -124,6 +122,6 @@ struct __wt_async_cursor {
*/
struct __wt_async_worker_state {
uint32_t id;
- STAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
+ TAILQ_HEAD(__wt_cursor_qh, __wt_async_cursor) cursorqh;
uint32_t num_cursors;
};
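
The STAILQ-to-TAILQ conversions throughout this patch buy O(1) removal from anywhere in a list (STAILQ_REMOVE must scan from the head to find the predecessor). A self-contained sys/queue.h example, not WiredTiger source:

#include <sys/queue.h>
#include <stdio.h>

struct cursor_cache {
	int id;
	TAILQ_ENTRY(cursor_cache) q;	/* forward and back pointers */
};

int
main(void)
{
	TAILQ_HEAD(, cursor_cache) qh = TAILQ_HEAD_INITIALIZER(qh);
	struct cursor_cache a = { .id = 1 }, b = { .id = 2 };

	TAILQ_INSERT_TAIL(&qh, &a, q);
	TAILQ_INSERT_TAIL(&qh, &b, q);
	TAILQ_REMOVE(&qh, &a, q);	/* O(1), no list scan needed */

	printf("head id = %d\n", TAILQ_FIRST(&qh)->id);
	return (0);
}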
diff --git a/src/include/bitstring.i b/src/include/bitstring.i
index c548c12761d..5449ffe6209 100644
--- a/src/include/bitstring.i
+++ b/src/include/bitstring.i
@@ -84,10 +84,10 @@ __bit_alloc(WT_SESSION_IMPL *session, uint64_t nbits, void *retp)
* __bit_test --
* Test one bit in name.
*/
-static inline int
+static inline bool
__bit_test(uint8_t *bitf, uint64_t bit)
{
- return (bitf[__bit_byte(bit)] & __bit_mask(bit) ? 1 : 0);
+ return ((bitf[__bit_byte(bit)] & __bit_mask(bit)) != 0);
}
/*
diff --git a/src/include/block.h b/src/include/block.h
index 795d646db1e..ce33b331e76 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -215,8 +215,8 @@ struct __wt_block {
/* A list of block manager handles, sharing a file descriptor. */
uint32_t ref; /* References */
WT_FH *fh; /* Backing file handle */
- SLIST_ENTRY(__wt_block) l; /* Linked list of handles */
- SLIST_ENTRY(__wt_block) hashl; /* Hashed list of handles */
+ TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
+ TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */
/* Configuration information, set when the file is opened. */
uint32_t allocfirst; /* Allocation is first-fit */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index f13504d66ca..f214ddb1dc3 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -6,6 +6,8 @@
* See the file LICENSE for redistribution information.
*/
+#define WT_RECNO_OOB 0 /* Illegal record number */
+
/*
* WT_PAGE_HEADER --
* Blocks have a common header, a WT_PAGE_HEADER structure followed by a
@@ -43,6 +45,7 @@ struct __wt_page_header {
#define WT_PAGE_EMPTY_V_ALL 0x02 /* Page has all zero-length values */
#define WT_PAGE_EMPTY_V_NONE 0x04 /* Page has no zero-length values */
#define WT_PAGE_ENCRYPTED 0x08 /* Page is encrypted on disk */
+#define WT_PAGE_LAS_UPDATE 0x10 /* Page updates in lookaside store */
uint8_t flags; /* 25: flags */
/*
@@ -168,6 +171,29 @@ struct __wt_ovfl_txnc {
};
/*
+ * Lookaside table support: when a page is being reconciled for eviction and has
+ * updates that might be required by earlier readers in the system, the updates
+ * are written into a lookaside table, and restored as necessary if the page is
+ * read. The key is a unique marker for the page (a file ID plus an address),
+ * a counter (used to ensure the update records remain in the original order),
+ * the on-page item's transaction ID (so we can discard any update records from
+ * the lookaside table once the on-page item's transaction is globally visible),
+ * and the page key (byte-string for row-store, record number for column-store).
+ * The value is the WT_UPDATE structure's transaction ID, update size and value.
+ *
+ * As the key for the lookaside table is different for row- and column-store, we
+ * store both key types in a WT_ITEM, building/parsing them in the code, because
+ * otherwise we'd need two lookaside files with different key formats. We could
+ * make the lookaside table's key standard by moving the source key into the
+ * lookaside table value, but that doesn't make the coding any simpler, and it
+ * makes the lookaside table's value more likely to overflow the page size when
+ * the row-store key is relatively large.
+ */
+#define WT_LAS_FORMAT \
+ "key_format=" WT_UNCHECKED_STRING(IuQQu) \
+ ",value_format=" WT_UNCHECKED_STRING(QIu)
+
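
For illustration only, the WT_LAS_FORMAT strings could be handed to the public WT_SESSION::create API as below; the table name is invented, and this is not how the internal lookaside table is actually created:

#include <wiredtiger.h>

static int
create_lookaside_like_table(WT_SESSION *session)
{
	/* IuQQu key: file ID, address, counter, txn ID, source key. */
	/* QIu value: update txn ID, update size, update data. */
	return (session->create(session, "table:las_demo",
	    "key_format=IuQQu,value_format=QIu"));
}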
+/*
* WT_PAGE_MODIFY --
* When a page is modified, there's additional information to maintain.
*/
@@ -238,15 +264,17 @@ struct __wt_page_modify {
* Eviction, but block wasn't written: unresolved updates and
* associated disk image.
*
- * Skipped updates are either a WT_INSERT, or a row-store leaf
- * page entry.
+ * Saved updates are either a WT_INSERT, or a row-store leaf
+ * page entry; in the case of creating lookaside records, there
+ * is an additional value, the committed item's transaction ID.
*/
- struct __wt_upd_skipped {
+ struct __wt_save_upd {
WT_INSERT *ins;
WT_ROW *rip;
- } *skip;
- uint32_t skip_entries;
- void *skip_dsk;
+ uint64_t onpage_txn;
+ } *supd;
+ uint32_t supd_entries;
+ void *supd_dsk;
/*
* Block was written: address, size and checksum.
@@ -556,9 +584,8 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
-#define WT_PAGE_SCANNING 0x10 /* Obsolete updates are being scanned */
+#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */
#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */
-#define WT_PAGE_SPLIT_LOCKED 0x40 /* An internal page is growing */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
/*
@@ -656,14 +683,6 @@ struct __wt_page {
* to the readers. If the evicting thread does not find a hazard pointer,
* the page is evicted.
*/
-typedef enum __wt_page_state {
- WT_REF_DISK=0, /* Page is on disk */
- WT_REF_DELETED, /* Page is on disk, but deleted */
- WT_REF_LOCKED, /* Page locked for exclusive access */
- WT_REF_MEM, /* Page is in cache and valid */
- WT_REF_READING, /* Page being read */
- WT_REF_SPLIT /* Parent page split (WT_REF dead) */
-} WT_PAGE_STATE;
/*
* WT_PAGE_DELETED --
@@ -691,7 +710,13 @@ struct __wt_ref {
WT_PAGE * volatile home; /* Reference page */
uint32_t pindex_hint; /* Reference page index hint */
- volatile WT_PAGE_STATE state; /* Page state */
+#define WT_REF_DISK 0 /* Page is on disk */
+#define WT_REF_DELETED 1 /* Page is on disk, but deleted */
+#define WT_REF_LOCKED 2 /* Page locked for exclusive access */
+#define WT_REF_MEM 3 /* Page is in cache and valid */
+#define WT_REF_READING 4 /* Page being read */
+#define WT_REF_SPLIT 5 /* Parent page split (WT_REF dead) */
+ volatile uint32_t state; /* Page state */
/*
* Address: on-page cell if read from backing block, off-page WT_ADDR
@@ -871,8 +896,9 @@ WT_PACKED_STRUCT_BEGIN(__wt_update)
* store 4GB objects; I'd rather do that than increase the size of this
* structure for a flag bit.
*/
-#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == UINT32_MAX)
-#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = UINT32_MAX)
+#define WT_UPDATE_DELETED_VALUE UINT32_MAX
+#define WT_UPDATE_DELETED_SET(upd) ((upd)->size = WT_UPDATE_DELETED_VALUE)
+#define WT_UPDATE_DELETED_ISSET(upd) ((upd)->size == WT_UPDATE_DELETED_VALUE)
uint32_t size; /* update length */
/* The untyped value immediately follows the WT_UPDATE structure. */
@@ -958,7 +984,7 @@ struct __wt_insert {
#define WT_PAGE_ALLOC_AND_SWAP(s, page, dest, v, count) do { \
if (((v) = (dest)) == NULL) { \
WT_ERR(__wt_calloc_def(s, count, &(v))); \
- if (WT_ATOMIC_CAS8(dest, NULL, v)) \
+ if (__wt_atomic_cas_ptr(&dest, NULL, v)) \
__wt_cache_page_inmem_incr( \
s, page, (count) * sizeof(*(v))); \
else \
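
WT_PAGE_ALLOC_AND_SWAP is the classic allocate-then-publish idiom: allocate outside the race, install with a compare-and-swap against NULL, and free the loser's allocation. A generic sketch in C11 atomics, with invented names:

#include <stdatomic.h>
#include <stdlib.h>

static int
alloc_and_swap(void *_Atomic *slot, size_t size)
{
	void *expected, *v;

	if (atomic_load(slot) != NULL)
		return (0);		/* already populated */
	if ((v = calloc(1, size)) == NULL)
		return (-1);
	expected = NULL;
	if (!atomic_compare_exchange_strong(slot, &expected, v))
		free(v);		/* another thread won the race */
	return (0);
}

int
main(void)
{
	static void *_Atomic slot;

	return (alloc_and_swap(&slot, 64));
}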
diff --git a/src/include/btree.h b/src/include/btree.h
index deecd8f6d88..98ce4c22c10 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -146,12 +146,14 @@ struct __wt_btree {
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
#define WT_BTREE_IN_MEMORY 0x00200 /* Cache-resident object */
-#define WT_BTREE_NO_EVICTION 0x00400 /* Disable eviction */
-#define WT_BTREE_NO_LOGGING 0x00800 /* Disable logging */
-#define WT_BTREE_SALVAGE 0x01000 /* Handle is for salvage */
-#define WT_BTREE_SKIP_CKPT 0x02000 /* Handle skipped checkpoint */
-#define WT_BTREE_UPGRADE 0x04000 /* Handle is for upgrade */
-#define WT_BTREE_VERIFY 0x08000 /* Handle is for verify */
+#define WT_BTREE_LOOKASIDE 0x00400 /* Look-aside table */
+#define WT_BTREE_NO_CHECKPOINT 0x00800 /* Disable checkpoints */
+#define WT_BTREE_NO_EVICTION 0x01000 /* Disable eviction */
+#define WT_BTREE_NO_LOGGING 0x02000 /* Disable logging */
+#define WT_BTREE_SALVAGE 0x04000 /* Handle is for salvage */
+#define WT_BTREE_SKIP_CKPT 0x08000 /* Handle skipped checkpoint */
+#define WT_BTREE_UPGRADE 0x10000 /* Handle is for upgrade */
+#define WT_BTREE_VERIFY 0x20000 /* Handle is for verify */
uint32_t flags;
};
diff --git a/src/include/btree.i b/src/include/btree.i
index d13ec1972fb..b54cecb6ce0 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -10,17 +10,17 @@
* __wt_ref_is_root --
* Return if the page reference is for the root page.
*/
-static inline int
+static inline bool
__wt_ref_is_root(WT_REF *ref)
{
- return (ref->home == NULL ? 1 : 0);
+ return (ref->home == NULL);
}
/*
* __wt_page_is_empty --
* Return if the page is empty.
*/
-static inline int
+static inline bool
__wt_page_is_empty(WT_PAGE *page)
{
return (page->modify != NULL &&
@@ -31,10 +31,10 @@ __wt_page_is_empty(WT_PAGE *page)
* __wt_page_is_modified --
* Return if the page is dirty.
*/
-static inline int
+static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0 ? 1 : 0);
+ return (page->modify != NULL && page->modify->write_gen != 0);
}
/*
@@ -49,46 +49,74 @@ __wt_cache_page_inmem_incr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
WT_ASSERT(session, size < WT_EXABYTE);
cache = S2C(session)->cache;
- (void)WT_ATOMIC_ADD8(cache->bytes_inmem, size);
- (void)WT_ATOMIC_ADD8(page->memory_footprint, size);
+ (void)__wt_atomic_add64(&cache->bytes_inmem, size);
+ (void)__wt_atomic_addsize(&page->memory_footprint, size);
if (__wt_page_is_modified(page)) {
- (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
- (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty, size);
+ (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}
/* Track internal and overflow size in cache. */
if (WT_PAGE_IS_INTERNAL(page))
- (void)WT_ATOMIC_ADD8(cache->bytes_internal, size);
+ (void)__wt_atomic_add64(&cache->bytes_internal, size);
else if (page->type == WT_PAGE_OVFL)
- (void)WT_ATOMIC_ADD8(cache->bytes_overflow, size);
+ (void)__wt_atomic_add64(&cache->bytes_overflow, size);
}
-/*
- * WT_CACHE_DECR --
- * Macro to decrement a field by a size.
- *
- * Be defensive and don't underflow: a band-aid on a gaping wound, but underflow
- * won't make things better no matter the problem (specifically, underflow makes
- * eviction crazy trying to evict non-existent memory).
+/*
+ * __wt_cache_decr_check_size --
+ * Decrement a size_t cache value and check for underflow.
*/
+static inline void
+__wt_cache_decr_check_size(
+ WT_SESSION_IMPL *session, size_t *vp, size_t v, const char *fld)
+{
+ if (__wt_atomic_subsize(vp, v) < WT_EXABYTE)
+ return;
+
#ifdef HAVE_DIAGNOSTIC
-#define WT_CACHE_DECR(session, f, sz) do { \
- static int __first = 1; \
- if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) { \
- (void)WT_ATOMIC_ADD8(f, sz); \
- if (__first) { \
- __wt_errx(session, \
- "%s underflow: decrementing %" WT_SIZET_FMT,\
- #f, sz); \
- __first = 0; \
- } \
- } \
-} while (0)
+ (void)__wt_atomic_addsize(vp, v);
+
+ {
+ static int first = 1;
+
+ if (!first)
+ return;
+ __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
+ first = 0;
+ }
#else
-#define WT_CACHE_DECR(s, f, sz) do { \
- if (WT_ATOMIC_SUB8(f, sz) > WT_EXABYTE) \
- (void)WT_ATOMIC_ADD8(f, sz); \
-} while (0)
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
#endif
+}
+
+/*
+ * __wt_cache_decr_check_uint64 --
+ * Decrement a uint64_t cache value and check for underflow.
+ */
+static inline void
+__wt_cache_decr_check_uint64(
+ WT_SESSION_IMPL *session, uint64_t *vp, size_t v, const char *fld)
+{
+ if (__wt_atomic_sub64(vp, v) < WT_EXABYTE)
+ return;
+
+#ifdef HAVE_DIAGNOSTIC
+ (void)__wt_atomic_add64(vp, v);
+
+ {
+ static int first = 1;
+
+ if (!first)
+ return;
+ __wt_errx(session, "%s underflow: decrementing %" WT_SIZET_FMT, fld, v);
+ first = 0;
+ }
+#else
+ WT_UNUSED(fld);
+ WT_UNUSED(session);
+#endif
+}
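
Both helpers share one trick: after an atomic subtract, any result at or above an impossible threshold (an exabyte) means the unsigned counter wrapped below zero. A standalone sketch of the guard, assuming C11 atomics rather than the __wt_atomic wrappers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define EXABYTE	(1152921504606846976ULL)	/* 2^60: no cache is this big */

static void
decr_check(_Atomic uint64_t *vp, uint64_t v, const char *fld)
{
	/* atomic_fetch_sub returns the old value; old - v is the new one. */
	if (atomic_fetch_sub(vp, v) - v < EXABYTE)
		return;
	(void)atomic_fetch_add(vp, v);		/* undo the wrap */
	fprintf(stderr, "%s underflow: decrementing %llu\n",
	    fld, (unsigned long long)v);
}

int
main(void)
{
	_Atomic uint64_t bytes = 10;

	decr_check(&bytes, 100, "bytes_inmem");	/* triggers the guard */
	return (0);
}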
/*
* __wt_cache_page_byte_dirty_decr --
@@ -128,9 +156,10 @@ __wt_cache_page_byte_dirty_decr(
*/
orig = page->modify->bytes_dirty;
decr = WT_MIN(size, orig);
- if (WT_ATOMIC_CAS8(
- page->modify->bytes_dirty, orig, orig - decr)) {
- WT_CACHE_DECR(session, cache->bytes_dirty, decr);
+ if (__wt_atomic_cassize(
+ &page->modify->bytes_dirty, orig, orig - decr)) {
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_dirty, decr, "WT_CACHE.bytes_dirty");
break;
}
}
@@ -149,15 +178,19 @@ __wt_cache_page_inmem_decr(WT_SESSION_IMPL *session, WT_PAGE *page, size_t size)
WT_ASSERT(session, size < WT_EXABYTE);
- WT_CACHE_DECR(session, cache->bytes_inmem, size);
- WT_CACHE_DECR(session, page->memory_footprint, size);
+ __wt_cache_decr_check_uint64(
+ session, &cache->bytes_inmem, size, "WT_CACHE.bytes_inmem");
+ __wt_cache_decr_check_size(
+ session, &page->memory_footprint, size, "WT_PAGE.memory_footprint");
if (__wt_page_is_modified(page))
__wt_cache_page_byte_dirty_decr(session, page, size);
/* Track internal and overflow size in cache. */
if (WT_PAGE_IS_INTERNAL(page))
- WT_CACHE_DECR(session, cache->bytes_internal, size);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_internal, size, "WT_CACHE.bytes_internal");
else if (page->type == WT_PAGE_OVFL)
- WT_CACHE_DECR(session, cache->bytes_overflow, size);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_overflow, size, "WT_CACHE.bytes_overflow");
}
/*
@@ -172,15 +205,15 @@ __wt_cache_dirty_incr(WT_SESSION_IMPL *session, WT_PAGE *page)
size_t size;
cache = S2C(session)->cache;
- (void)WT_ATOMIC_ADD8(cache->pages_dirty, 1);
+ (void)__wt_atomic_add64(&cache->pages_dirty, 1);
/*
* Take care to read the memory_footprint once in case we are racing
* with updates.
*/
size = page->memory_footprint;
- (void)WT_ATOMIC_ADD8(cache->bytes_dirty, size);
- (void)WT_ATOMIC_ADD8(page->modify->bytes_dirty, size);
+ (void)__wt_atomic_add64(&cache->bytes_dirty, size);
+ (void)__wt_atomic_addsize(&page->modify->bytes_dirty, size);
}
/*
@@ -202,7 +235,7 @@ __wt_cache_dirty_decr(WT_SESSION_IMPL *session, WT_PAGE *page)
"count went negative");
cache->pages_dirty = 0;
} else
- (void)WT_ATOMIC_SUB8(cache->pages_dirty, 1);
+ (void)__wt_atomic_sub64(&cache->pages_dirty, 1);
modify = page->modify;
if (modify != NULL && modify->bytes_dirty != 0)
@@ -224,12 +257,15 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
modify = page->modify;
/* Update the bytes in-memory to reflect the eviction. */
- WT_CACHE_DECR(session, cache->bytes_inmem, page->memory_footprint);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_inmem,
+ page->memory_footprint, "WT_CACHE.bytes_inmem");
/* Update the bytes_internal value to reflect the eviction */
if (WT_PAGE_IS_INTERNAL(page))
- WT_CACHE_DECR(session,
- cache->bytes_internal, page->memory_footprint);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_internal,
+ page->memory_footprint, "WT_CACHE.bytes_internal");
/* Update the cache's dirty-byte count. */
if (modify != NULL && modify->bytes_dirty != 0) {
@@ -239,13 +275,14 @@ __wt_cache_page_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
"dirty byte count went negative");
cache->bytes_dirty = 0;
} else
- WT_CACHE_DECR(
- session, cache->bytes_dirty, modify->bytes_dirty);
+ __wt_cache_decr_check_uint64(session,
+ &cache->bytes_dirty,
+ modify->bytes_dirty, "WT_CACHE.bytes_dirty");
}
/* Update pages and bytes evicted. */
- (void)WT_ATOMIC_ADD8(cache->bytes_evict, page->memory_footprint);
- (void)WT_ATOMIC_ADD8(cache->pages_evict, 1);
+ (void)__wt_atomic_add64(&cache->bytes_evict, page->memory_footprint);
+ (void)__wt_atomic_add64(&cache->pages_evict, 1);
}
/*
@@ -306,7 +343,7 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
*/
- if (WT_ATOMIC_ADD4(page->modify->write_gen, 1) == 1) {
+ if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
__wt_cache_dirty_incr(session, page);
/*
@@ -321,9 +358,13 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* have committed in the meantime, and the last_running field
* been updated past it. That is all very unlikely, but not
* impossible, so we take care to read the global state before
- * the atomic increment. If we raced with reconciliation, just
- * leave the previous value here: at worst, we will write a
- * page in a checkpoint when not absolutely necessary.
+ * the atomic increment.
+ *
+ * If the page was dirty on entry, then last_running == 0. The
+ * page could have become clean since then, if reconciliation
+ * completed. In that case, we leave the previous value for
+ * first_dirty_txn rather than potentially racing to update it,
+ * at worst, we'll unnecessarily write a page in a checkpoint.
*/
if (last_running != 0)
page->modify->first_dirty_txn = last_running;
@@ -335,6 +376,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
+ * __wt_page_modify_clear --
+ * Clean a modified page.
+ */
+static inline void
+__wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ /*
+ * The page must be held exclusive when this call is made, this call
+ * can only be used when the page is owned by a single thread.
+ *
+ * Allow the call to be made on clean pages.
+ */
+ if (__wt_page_is_modified(page)) {
+ page->modify->write_gen = 0;
+ __wt_cache_dirty_decr(session, page);
+ }
+}
+
+/*
* __wt_page_modify_set --
* Mark the page and tree dirty.
*/
@@ -354,6 +414,9 @@ __wt_page_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
* shouldn't cause problems; regardless, let's play it safe.)
*/
if (S2BT(session)->modified == 0) {
+ /* Assert we never dirty a checkpoint handle. */
+ WT_ASSERT(session, session->dhandle->checkpoint == NULL);
+
S2BT(session)->modified = 1;
WT_FULL_BARRIER();
}
@@ -395,7 +458,7 @@ __wt_page_parent_modify_set(
* __wt_off_page --
* Return if a pointer references off-page data.
*/
-static inline int
+static inline bool
__wt_off_page(WT_PAGE *page, const void *p)
{
/*
@@ -496,7 +559,12 @@ __wt_ref_key_instantiated(WT_REF *ref)
static inline void
__wt_ref_key_clear(WT_REF *ref)
{
- /* The key union has 2 fields, both of which are 8B. */
+ /*
+ * The key union has 2 8B fields; this is equivalent to:
+ *
+ * ref->key.recno = WT_RECNO_OOB;
+ * ref->key.ikey = NULL;
+ */
ref->key.recno = 0;
}
@@ -506,7 +574,7 @@ __wt_ref_key_clear(WT_REF *ref)
* had without unpacking a cell, and information about the cell, if the key
* isn't cheaply available.
*/
-static inline int
+static inline bool
__wt_row_leaf_key_info(WT_PAGE *page, void *copy,
WT_IKEY **ikeyp, WT_CELL **cellp, void *datap, size_t *sizep)
{
@@ -597,7 +665,7 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (cellp != NULL)
*cellp =
WT_PAGE_REF_OFFSET(page, WT_CELL_DECODE_OFFSET(v));
- return (0);
+ return (false);
case WT_K_FLAG:
/* Encoded key: no instantiated key, no cell. */
if (cellp != NULL)
@@ -608,9 +676,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap =
WT_PAGE_REF_OFFSET(page, WT_K_DECODE_KEY_OFFSET(v));
*sizep = WT_K_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
case WT_KV_FLAG:
/* Encoded key/value pair: no instantiated key, no cell. */
if (cellp != NULL)
@@ -621,9 +689,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
*(void **)datap = WT_PAGE_REF_OFFSET(
page, WT_KV_DECODE_KEY_OFFSET(v));
*sizep = WT_KV_DECODE_KEY_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
@@ -636,9 +704,9 @@ __wt_row_leaf_key_info(WT_PAGE *page, void *copy,
if (datap != NULL) {
*(void **)datap = WT_IKEY_DATA(ikey);
*sizep = ikey->size;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -826,7 +894,7 @@ __wt_row_leaf_value_cell(WT_PAGE *page, WT_ROW *rip, WT_CELL_UNPACK *kpack)
* __wt_row_leaf_value --
* Return the value for a row-store leaf page encoded key/value pair.
*/
-static inline int
+static inline bool
__wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
{
uintptr_t v;
@@ -842,9 +910,9 @@ __wt_row_leaf_value(WT_PAGE *page, WT_ROW *rip, WT_ITEM *value)
value->data =
WT_PAGE_REF_OFFSET(page, WT_KV_DECODE_VALUE_OFFSET(v));
value->size = WT_KV_DECODE_VALUE_LEN(v);
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -903,11 +971,13 @@ __wt_ref_info(WT_SESSION_IMPL *session,
* __wt_page_can_split --
* Check whether a page can be split in memory.
*/
-static inline int
+static inline bool
__wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_INSERT_HEAD *ins_head;
+ WT_INSERT *ins;
+ int i;
btree = S2BT(session);
@@ -916,58 +986,54 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page)
* of the page could continually split without benefit.
*/
if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT))
- return (0);
+ return (false);
/*
* Check for pages with append-only workloads. A common application
* pattern is to have multiple threads frantically appending to the
* tree. We want to reconcile and evict this page, but we'd like to
- * do it without making the appending threads wait. If we're not
- * discarding the tree, check and see if it's worth doing a split to
- * let the threads continue before doing eviction.
- *
- * Ignore anything other than large, dirty row-store leaf pages.
+ * do it without making the appending threads wait. See if it's worth
+ * doing a split to let the threads continue before doing eviction.
*
- * XXX KEITH
- * Need a better test for append-only workloads.
+ * Ignore anything other than large, dirty row-store leaf pages. The
+ * split code only supports row-store pages, and we depend on the page
+ * being dirty for correctness (the page must be reconciled again
+ * before being evicted after the split, information from a previous
+ * reconciliation will be wrong, so we can't evict immediately).
*/
if (page->type != WT_PAGE_ROW_LEAF ||
page->memory_footprint < btree->maxmempage ||
!__wt_page_is_modified(page))
- return (0);
-
- /* Don't split a page that is pending a multi-block split. */
- if (F_ISSET(page->modify, WT_PM_REC_MULTIBLOCK))
- return (0);
+ return (false);
/*
	 * There is no point splitting if the list is small; no deep items is
- * our heuristic for that. (A 1/4 probability of adding a new skiplist
- * level means there will be a new 6th level for roughly each 4KB of
- * entries in the list. If we have at least two 6th level entries, the
- * list is at least large enough to work with.)
- *
- * The following code requires at least two items on the insert list,
- * this test serves the additional purpose of confirming that.
+ * our heuristic for that. A 1/4 probability of adding a new skiplist
+ * level, with level-0 always created, means there will be a 5th level
+ * entry for roughly every 1024 entries in the list. If there are at
+ * least 4 5th level entries (4K items), the list is large enough.
*/
-#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(6, WT_SKIP_MAXDEPTH - 1)
+#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1)
ins_head = page->pg_row_entries == 0 ?
WT_ROW_INSERT_SMALLEST(page) :
WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
- if (ins_head == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] == NULL ||
- ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH] ==
- ins_head->tail[WT_MIN_SPLIT_SKIPLIST_DEPTH])
- return (0);
-
- return (1);
+ if (ins_head == NULL)
+ return (false);
+ for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH];
+ ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH])
+ if (++i == 4) {
+ WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable);
+ WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable);
+ return (true);
+ }
+ return (false);
}
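
The revised depth heuristic is plain skiplist arithmetic; a few lines verifying the 1-in-1024 figure quoted in the comment:

#include <stdio.h>

int
main(void)
{
	double p = 1.0;
	int level;

	/* Level 0 is always created; each further level has probability 1/4. */
	for (level = 1; level <= 5; ++level)
		p /= 4.0;
	printf("P(level 5) = %g, items per level-5 entry = %g\n",
	    p, 1.0 / p);	/* 1/1024 and 1024: 4 entries ~ 4K items */
	return (0);
}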
/*
* __wt_page_can_evict --
* Check whether a page can be evicted.
*/
-static inline int
+static inline bool
__wt_page_can_evict(WT_SESSION_IMPL *session,
WT_PAGE *page, int check_splits, int *inmem_splitp)
{
@@ -980,11 +1046,22 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
btree = S2BT(session);
mod = page->modify;
- txn_global = &S2C(session)->txn_global;
/* Pages that have never been modified can always be evicted. */
if (mod == NULL)
- return (1);
+ return (true);
+
+ /*
+ * Check for in-memory splits before other eviction tests. If the page
+ * should split in-memory, return success immediately and skip more
+ * detailed eviction tests. We don't need further tests since the page
+ * won't be written or discarded from the cache.
+ */
+ if (__wt_page_can_split(session, page)) {
+ if (inmem_splitp != NULL)
+ *inmem_splitp = 1;
+ return (true);
+ }
/*
* If the tree was deepened, there's a requirement that newly created
@@ -997,20 +1074,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
*/
if (check_splits && WT_PAGE_IS_INTERNAL(page) &&
!__wt_txn_visible_all(session, mod->mod_split_txn))
- return (0);
-
- /*
- * Allow for the splitting of pages when a checkpoint is underway only
- * if the allow_splits flag has been passed, we know we are performing
- * a checkpoint, the page is larger than the stated maximum and there
- * has not already been a split on this page as the WT_PM_REC_MULTIBLOCK
- * flag is unset.
- */
- if (__wt_page_can_split(session, page)) {
- if (inmem_splitp != NULL)
- *inmem_splitp = 1;
- return (1);
- }
+ return (false);
/*
* If the file is being checkpointed, we can't evict dirty pages:
@@ -1018,48 +1082,27 @@ __wt_page_can_evict(WT_SESSION_IMPL *session,
* previous version might be referenced by an internal page already
* been written in the checkpoint, leaving the checkpoint inconsistent.
*/
- if (btree->checkpointing &&
- (__wt_page_is_modified(page) ||
- F_ISSET(mod, WT_PM_REC_MULTIBLOCK))) {
+ if (btree->checkpointing && __wt_page_is_modified(page)) {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
- return (0);
+ return (false);
}
/*
- * If we aren't (potentially) doing eviction that can restore updates
- * and the updates on this page are too recent, give up.
- *
- * Don't rely on new updates being skipped by the transaction used
- * for transaction reads: (1) there are paths that dirty pages for
- * artificial reasons; (2) internal pages aren't transactional; and
- * (3) if an update was skipped during the checkpoint (leaving the page
- * dirty), then rolled back, we could still successfully overwrite a
- * page and corrupt the checkpoint.
+ * If the page was recently split in-memory, don't evict it immediately:
+ * we want to give application threads that are appending a chance to
+ * move to the new leaf page created by the split.
*
- * Further, we can't race with the checkpoint's reconciliation of
- * an internal page as we evict a clean child from the page's subtree.
- * This works in the usual way: eviction locks the page and then checks
- * for existing hazard pointers, the checkpoint thread reconciling an
- * internal page acquires hazard pointers on child pages it reads, and
- * is blocked by the exclusive lock.
- */
- if (page->read_gen != WT_READGEN_OLDEST &&
- !__wt_txn_visible_all(session, __wt_page_is_modified(page) ?
- mod->update_txn : mod->rec_max_txn))
- return (0);
-
- /*
- * If the page was recently split in-memory, don't force it out: we
- * hope an eviction thread will find it first. The check here is
- * similar to __wt_txn_visible_all, but ignores the checkpoint's
- * transaction.
+ * Note the check here is similar to __wt_txn_visible_all, but ignores
+ * the checkpoint's transaction.
*/
- if (check_splits &&
- WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
- return (0);
+ if (check_splits) {
+ txn_global = &S2C(session)->txn_global;
+ if (WT_TXNID_LE(txn_global->oldest_id, mod->inmem_split_txn))
+ return (false);
+ }
- return (1);
+ return (true);
}
/*
@@ -1082,17 +1125,17 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
* reference without first locking the page, it could be evicted in
* between.
*/
- locked = WT_ATOMIC_CAS4(ref->state, WT_REF_MEM, WT_REF_LOCKED);
+ locked = __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED);
if ((ret = __wt_hazard_clear(session, page)) != 0 || !locked) {
if (locked)
ref->state = WT_REF_MEM;
return (ret == 0 ? EBUSY : ret);
}
- (void)WT_ATOMIC_ADD4(btree->evict_busy, 1);
+ (void)__wt_atomic_addv32(&btree->evict_busy, 1);
too_big = (page->memory_footprint > btree->maxmempage) ? 1 : 0;
- if ((ret = __wt_evict_page(session, ref)) == 0) {
+ if ((ret = __wt_evict(session, ref, 0)) == 0) {
if (too_big)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
else
@@ -1106,7 +1149,7 @@ __wt_page_release_evict(WT_SESSION_IMPL *session, WT_REF *ref)
} else
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force_fail);
- (void)WT_ATOMIC_SUB4(btree->evict_busy, 1);
+ (void)__wt_atomic_subv32(&btree->evict_busy, 1);
return (ret);
}
@@ -1143,12 +1186,13 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
* memory_page_max setting, when we see many deleted items, and when we
* are attempting to scan without trashing the cache.
*
- * Fast checks if eviction is disabled for this operation or this tree,
- * then perform a general check if eviction will be possible.
+ * Fast checks if eviction is disabled for this handle, operation or
+ * tree, then perform a general check if eviction will be possible.
*/
page = ref->page;
if (page->read_gen != WT_READGEN_OLDEST ||
LF_ISSET(WT_READ_NO_EVICT) ||
+ F_ISSET(session, WT_SESSION_NO_EVICTION) ||
F_ISSET(btree, WT_BTREE_NO_EVICTION) ||
!__wt_page_can_evict(session, page, 1, NULL))
return (__wt_hazard_clear(session, page));
@@ -1264,13 +1308,13 @@ __wt_skip_choose_depth(WT_SESSION_IMPL *session)
}
/*
- * __wt_btree_lsm_size --
+ * __wt_btree_lsm_over_size --
* Return if the size of an in-memory tree with a single leaf page is over
* a specified maximum. If called on anything other than a simple tree with a
* single leaf page, returns true so our LSM caller will switch to a new tree.
*/
-static inline int
-__wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
+static inline bool
+__wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize)
{
WT_BTREE *btree;
WT_PAGE *child, *root;
@@ -1282,20 +1326,20 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
/* Check for a non-existent tree. */
if (root == NULL)
- return (0);
+ return (false);
/* A tree that can be evicted always requires a switch. */
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
- return (1);
+ return (true);
/* Check for a tree with a single leaf page. */
WT_INTL_INDEX_GET(session, root, pindex);
if (pindex->entries != 1) /* > 1 child page, switch */
- return (1);
+ return (true);
first = pindex->index[0];
if (first->state != WT_REF_MEM) /* no child page, ignore */
- return (0);
+ return (false);
/*
* We're reaching down into the page without a hazard pointer, but
@@ -1304,7 +1348,7 @@ __wt_btree_lsm_size(WT_SESSION_IMPL *session, uint64_t maxsize)
*/
child = first->page;
if (child->type != WT_PAGE_ROW_LEAF) /* not a single leaf page */
- return (1);
+ return (true);
return (child->memory_footprint > maxsize);
}
diff --git a/src/include/cache.h b/src/include/cache.h
index ed93f82538c..f199372ea5e 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -18,11 +18,6 @@
#define WT_EVICT_WALK_BASE 300 /* Pages tracked across file visits */
#define WT_EVICT_WALK_INCR 100 /* Pages added each walk */
-#define WT_EVICT_PASS_AGGRESSIVE 0x01
-#define WT_EVICT_PASS_ALL 0x02
-#define WT_EVICT_PASS_DIRTY 0x04
-#define WT_EVICT_PASS_WOULD_BLOCK 0x08
-
/*
* WT_EVICT_ENTRY --
* Encapsulation of an eviction candidate.
@@ -96,7 +91,7 @@ struct __wt_cache {
/*
* LRU eviction list information.
*/
- WT_EVICT_ENTRY *evict; /* LRU pages being tracked */
+ WT_EVICT_ENTRY *evict_queue; /* LRU pages being tracked */
WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
uint32_t evict_candidates; /* LRU list pages to evict */
uint32_t evict_entries; /* LRU entries in the queue */
@@ -109,6 +104,7 @@ struct __wt_cache {
* Cache pool information.
*/
uint64_t cp_pass_pressure; /* Calculated pressure from this pass */
+ uint64_t cp_quota; /* Maximum size for this cache */
uint64_t cp_reserved; /* Base size for this cache */
WT_SESSION_IMPL *cp_session; /* May be used for cache management */
uint32_t cp_skip_count; /* Post change stabilization */
@@ -119,6 +115,15 @@ struct __wt_cache {
uint64_t cp_saved_read; /* Read count at last review */
/*
+ * Work state.
+ */
+#define WT_EVICT_PASS_AGGRESSIVE 0x01
+#define WT_EVICT_PASS_ALL 0x02
+#define WT_EVICT_PASS_DIRTY 0x04
+#define WT_EVICT_PASS_WOULD_BLOCK 0x08
+ uint32_t state;
+
+ /*
* Flags.
*/
#define WT_CACHE_POOL_MANAGER 0x01 /* The active cache pool manager */
@@ -140,6 +145,7 @@ struct __wt_cache_pool {
const char *name;
uint64_t size;
uint64_t chunk;
+ uint64_t quota;
uint64_t currently_used;
uint32_t refs; /* Reference count for structure. */
/* Locked: List of connections participating in the cache pool. */
diff --git a/src/include/cache.i b/src/include/cache.i
index 87f8c5543d1..bc33f82d927 100644
--- a/src/include/cache.i
+++ b/src/include/cache.i
@@ -104,48 +104,6 @@ __wt_cache_dirty_inuse(WT_CACHE *cache)
}
/*
- * __wt_cache_status --
- * Return if the cache usage exceeds the eviction or dirty targets.
- */
-static inline void
-__wt_cache_status(WT_SESSION_IMPL *session, int *evictp, int *dirtyp)
-{
- WT_CONNECTION_IMPL *conn;
- WT_CACHE *cache;
- uint64_t bytes_inuse, bytes_max, dirty_inuse;
-
- conn = S2C(session);
- cache = conn->cache;
-
- /*
- * There's an assumption "evict" overrides "dirty", that is, if eviction
- * is required, we no longer care where we are with respect to the dirty
- * target.
- *
- * Avoid division by zero if the cache size has not yet been set in a
- * shared cache.
- */
- bytes_max = conn->cache_size + 1;
- if (evictp != NULL) {
- bytes_inuse = __wt_cache_bytes_inuse(cache);
- if (bytes_inuse > (cache->eviction_target * bytes_max) / 100) {
- *evictp = 1;
- return;
- }
- *evictp = 0;
- }
- if (dirtyp != NULL) {
- dirty_inuse = __wt_cache_dirty_inuse(cache);
- if (dirty_inuse >
- (cache->eviction_dirty_target * bytes_max) / 100) {
- *dirtyp = 1;
- return;
- }
- *dirtyp = 0;
- }
-}
-
-/*
* __wt_session_can_wait --
 * Return if a session is available for a potentially slow operation.
*/
@@ -161,29 +119,52 @@ __wt_session_can_wait(WT_SESSION_IMPL *session)
return (0);
/*
- * LSM sets the no-cache-check flag when holding the LSM tree lock,
+ * LSM sets the no-eviction flag when holding the LSM tree lock,
* in that case, or when holding the schema lock, we don't want to
 * hijack the thread for eviction.
*/
if (F_ISSET(session,
- WT_SESSION_NO_CACHE_CHECK | WT_SESSION_LOCKED_SCHEMA))
+ WT_SESSION_NO_EVICTION | WT_SESSION_LOCKED_SCHEMA))
return (0);
return (1);
}
/*
+ * __wt_eviction_aggressive --
+ * Return if the eviction server is running in aggressive mode.
+ */
+static inline int
+__wt_eviction_aggressive(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_AGGRESSIVE) ? 1 : 0);
+}
+
+/*
+ * __wt_eviction_dirty_target --
+ * Return if the eviction server is running to reduce the number of dirty
+ * pages (versus running to discard pages from the cache).
+ */
+static inline int
+__wt_eviction_dirty_target(WT_SESSION_IMPL *session)
+{
+ return (FLD_ISSET(
+ S2C(session)->cache->state, WT_EVICT_PASS_DIRTY) ? 1 : 0);
+}
+
+/*
* __wt_eviction_needed --
* Return if an application thread should do eviction, and the cache full
* percentage as a side-effect.
*/
-static inline int
-__wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
+static inline bool
+__wt_eviction_needed(WT_SESSION_IMPL *session, u_int *pct_fullp)
{
WT_CONNECTION_IMPL *conn;
WT_CACHE *cache;
uint64_t bytes_inuse, bytes_max;
- int pct_full;
+ u_int pct_full;
conn = S2C(session);
cache = conn->cache;
@@ -196,25 +177,20 @@ __wt_eviction_needed(WT_SESSION_IMPL *session, int *pct_fullp)
bytes_max = conn->cache_size + 1;
/*
- * Return the cache full percentage; anything over 95% means we involve
- * the application thread.
+ * Calculate the cache full percentage; anything over the trigger means
+ * we involve the application thread.
*/
- pct_full = (int)((100 * bytes_inuse) / bytes_max);
+ pct_full = (u_int)((100 * bytes_inuse) / bytes_max);
if (pct_fullp != NULL)
*pct_fullp = pct_full;
- if (pct_full >= 95)
- return (1);
+ if (pct_full > cache->eviction_trigger)
+ return (true);
- /*
- * Return if we're over the trigger cache size or there are too many
- * dirty pages.
- */
- if (bytes_inuse > (cache->eviction_trigger * bytes_max) / 100)
- return (1);
+ /* Return if there are too many dirty bytes in cache. */
if (__wt_cache_dirty_inuse(cache) >
(cache->eviction_dirty_trigger * bytes_max) / 100)
- return (1);
- return (0);
+ return (true);
+ return (false);
}
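
A compact model of the revised trigger test, with the cache fields reduced to a toy struct; note the +1 on the divisor, kept from the original to avoid division by zero before the shared-cache size is set:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cache_cfg {
	uint64_t bytes_inuse;
	uint64_t cache_size;
	unsigned eviction_trigger;	/* percent, e.g. 95 */
};

static bool
eviction_needed(const struct cache_cfg *c, unsigned *pct_fullp)
{
	uint64_t bytes_max = c->cache_size + 1;
	unsigned pct_full = (unsigned)((100 * c->bytes_inuse) / bytes_max);

	if (pct_fullp != NULL)
		*pct_fullp = pct_full;	/* side-effect for the caller */
	return (pct_full > c->eviction_trigger);
}

int
main(void)
{
	struct cache_cfg c = { .bytes_inuse = 98, .cache_size = 100,
	    .eviction_trigger = 95 };
	unsigned pct;

	printf("needed=%d pct=%u\n", eviction_needed(&c, &pct), pct);
	return (0);
}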
/*
@@ -225,7 +201,7 @@ static inline int
__wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
{
WT_BTREE *btree;
- int pct_full;
+ u_int pct_full;
if (didworkp != NULL)
*didworkp = 0;
@@ -235,7 +211,7 @@ __wt_cache_eviction_check(WT_SESSION_IMPL *session, int busy, int *didworkp)
* that case, or when holding the schema or handle list locks (which
	 * block eviction), we don't want to hijack the thread for eviction.
*/
- if (F_ISSET(session, WT_SESSION_NO_CACHE_CHECK |
+ if (F_ISSET(session, WT_SESSION_NO_EVICTION |
WT_SESSION_LOCKED_HANDLE_LIST | WT_SESSION_LOCKED_SCHEMA))
return (0);
diff --git a/src/include/cell.i b/src/include/cell.i
index 20a4d214015..a517ac4a523 100644
--- a/src/include/cell.i
+++ b/src/include/cell.i
@@ -182,7 +182,7 @@ __wt_cell_pack_addr(WT_CELL *cell, u_int cell_type, uint64_t recno, size_t size)
p = cell->__chunk + 1;
- if (recno == 0)
+ if (recno == WT_RECNO_OOB)
cell->__chunk[0] = cell_type; /* Type */
else {
cell->__chunk[0] = cell_type | WT_CELL_64V;
@@ -547,7 +547,8 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
* Unpack a WT_CELL into a structure during verification.
*/
static inline int
-__wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
+__wt_cell_unpack_safe(
+ WT_CELL *cell, WT_CELL_UNPACK *unpack, const void *start, const void *end)
{
struct {
uint32_t len;
@@ -560,14 +561,15 @@ __wt_cell_unpack_safe(WT_CELL *cell, WT_CELL_UNPACK *unpack, uint8_t *end)
copy.v = 0; /* -Werror=maybe-uninitialized */
/*
- * The verification code specifies an end argument, a pointer to 1 past
- * the end-of-page. In that case, make sure we don't go past the end
- * of the page when reading. If an error occurs, we simply return the
- * error code, the verification code takes care of complaining (and, in
- * the case of salvage, it won't complain at all, it's OK to fail).
+ * The verification code specifies start/end arguments, pointers to the
+ * start of the page and to 1 past the end-of-page. In which case, make
+ * sure all reads are inside the page image. If an error occurs, return
+ * an error code but don't output messages, our caller handles that.
*/
-#define WT_CELL_LEN_CHK(p, len) do { \
- if (end != NULL && (((uint8_t *)p) + (len)) > end) \
+#define WT_CELL_LEN_CHK(t, len) do { \
+ if (start != NULL && \
+ ((uint8_t *)t < (uint8_t *)start || \
+ (((uint8_t *)t) + (len)) > (uint8_t *)end)) \
return (WT_ERROR); \
} while (0)
@@ -630,7 +632,7 @@ restart:
*/
if (cell->__chunk[0] & WT_CELL_64V) /* skip value */
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &unpack->v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &unpack->v));
/*
* Handle special actions for a few different cell types and set the
@@ -647,7 +649,7 @@ restart:
* earlier cell.
*/
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
copy.len = WT_PTRDIFF32(p, cell);
copy.v = unpack->v;
cell = (WT_CELL *)((uint8_t *)cell - v);
@@ -675,7 +677,7 @@ restart:
* data.
*/
WT_RET(__wt_vunpack_uint(
- &p, end == NULL ? 0 : (size_t)(end - p), &v));
+ &p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
if (unpack->raw == WT_CELL_KEY ||
unpack->raw == WT_CELL_KEY_PFX ||
@@ -716,7 +718,7 @@ done: WT_CELL_LEN_CHK(cell, unpack->__len);
static inline void
__wt_cell_unpack(WT_CELL *cell, WT_CELL_UNPACK *unpack)
{
- (void)__wt_cell_unpack_safe(cell, unpack, NULL);
+ (void)__wt_cell_unpack_safe(cell, unpack, NULL, NULL);
}
/*
diff --git a/src/include/connection.h b/src/include/connection.h
index cd55aadfc07..d8ff261cd82 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -38,8 +38,8 @@ struct __wt_keyed_encryptor {
size_t size_const; /* The result of the sizing callback */
WT_ENCRYPTOR *encryptor; /* User supplied callbacks */
/* Linked list of encryptors */
- SLIST_ENTRY(__wt_keyed_encryptor) hashl;
- SLIST_ENTRY(__wt_keyed_encryptor) l;
+ TAILQ_ENTRY(__wt_keyed_encryptor) hashq;
+ TAILQ_ENTRY(__wt_keyed_encryptor) q;
};
/*
@@ -82,9 +82,9 @@ struct __wt_named_encryptor {
const char *name; /* Name of encryptor */
WT_ENCRYPTOR *encryptor; /* User supplied callbacks */
/* Locked: list of encryptors by key */
- SLIST_HEAD(__wt_keyedhash, __wt_keyed_encryptor)
- keyedhashlh[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_keyed_lh, __wt_keyed_encryptor) keyedlh;
+ TAILQ_HEAD(__wt_keyedhash, __wt_keyed_encryptor)
+ keyedhashqh[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_keyed_qh, __wt_keyed_encryptor) keyedqh;
/* Linked list of encryptors */
TAILQ_ENTRY(__wt_named_encryptor) q;
};
@@ -100,10 +100,10 @@ struct __wt_named_extractor {
};
/*
- * Allocate some additional slots for internal sessions. There is a default
- * session for each connection, plus a session for each server thread.
+ * Allocate some additional slots for internal sessions so the user cannot
+ * configure too few sessions for us to run.
*/
-#define WT_NUM_INTERNAL_SESSIONS 10
+#define WT_EXTRA_INTERNAL_SESSIONS 10
/*
* WT_CONN_CHECK_PANIC --
@@ -119,14 +119,15 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->dhlh, dhandle, l); \
- SLIST_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->dhqh, dhandle, q); \
+ TAILQ_INSERT_HEAD(&(conn)->dhhash[bucket], dhandle, hashq); \
+ ++conn->dhandle_count; \
} while (0)
#define WT_CONN_DHANDLE_REMOVE(conn, dhandle, bucket) do { \
- SLIST_REMOVE(&(conn)->dhlh, dhandle, __wt_data_handle, l); \
- SLIST_REMOVE(&(conn)->dhhash[bucket], \
- dhandle, __wt_data_handle, hashl); \
+ TAILQ_REMOVE(&(conn)->dhqh, dhandle, q); \
+ TAILQ_REMOVE(&(conn)->dhhash[bucket], dhandle, hashq); \
+ --conn->dhandle_count; \
} while (0)
/*
@@ -134,14 +135,13 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_BLOCK_INSERT(conn, block, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->blocklh, block, l); \
- SLIST_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->blockqh, block, q); \
+ TAILQ_INSERT_HEAD(&(conn)->blockhash[bucket], block, hashq); \
} while (0)
#define WT_CONN_BLOCK_REMOVE(conn, block, bucket) do { \
- SLIST_REMOVE(&(conn)->blocklh, block, __wt_block, l); \
- SLIST_REMOVE( \
- &(conn)->blockhash[bucket], block, __wt_block, hashl); \
+ TAILQ_REMOVE(&(conn)->blockqh, block, q); \
+ TAILQ_REMOVE(&(conn)->blockhash[bucket], block, hashq); \
} while (0)
/*
@@ -149,13 +149,13 @@ struct __wt_named_extractor {
* main queue and the hashed queue.
*/
#define WT_CONN_FILE_INSERT(conn, fh, bucket) do { \
- SLIST_INSERT_HEAD(&(conn)->fhlh, fh, l); \
- SLIST_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashl); \
+ TAILQ_INSERT_HEAD(&(conn)->fhqh, fh, q); \
+ TAILQ_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashq); \
} while (0)
#define WT_CONN_FILE_REMOVE(conn, fh, bucket) do { \
- SLIST_REMOVE(&(conn)->fhlh, fh, __wt_fh, l); \
- SLIST_REMOVE(&(conn)->fhhash[bucket], fh, __wt_fh, hashl); \
+ TAILQ_REMOVE(&(conn)->fhqh, fh, q); \
+ TAILQ_REMOVE(&(conn)->fhhash[bucket], fh, hashq); \
} while (0)
/*
@@ -180,13 +180,17 @@ struct __wt_connection_impl {
WT_SPINLOCK table_lock; /* Table creation spinlock */
/*
- * We distribute the btree page locks across a set of spin locks; it
- * can't be an array, we impose cache-line alignment and gcc doesn't
- * support that for arrays. Don't use too many: they are only held for
- * very short operations, each one is 64 bytes, so 256 will fill the L1
- * cache on most CPUs.
+ * We distribute the btree page locks across a set of spin locks. Don't
+ * use too many: they are only held for very short operations, each one
+ * is 64 bytes, so 256 will fill the L1 cache on most CPUs.
+ *
+ * Use a prime number of buckets rather than assuming a good hash
+	 * (see Sedgewick, Algorithms in C, "Hash Functions").
+ *
+ * Note: this can't be an array, we impose cache-line alignment and gcc
+ * doesn't support that for arrays smaller than the alignment.
*/
-#define WT_PAGE_LOCKS(conn) 16
+#define WT_PAGE_LOCKS 17
WT_SPINLOCK *page_lock; /* Btree page spinlocks */
u_int page_lock_cnt; /* Next spinlock to use */
@@ -211,6 +215,8 @@ struct __wt_connection_impl {
WT_FH *lock_fh; /* Lock file handle */
volatile uint64_t split_gen; /* Generation number for splits */
+ uint64_t split_stashed_bytes; /* Atomic: split statistics */
+ uint64_t split_stashed_objects;
/*
* The connection keeps a cache of data handles. The set of handles
@@ -219,24 +225,26 @@ struct __wt_connection_impl {
* URI.
*/
/* Locked: data handle hash array */
- SLIST_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_dhhash, __wt_data_handle) dhhash[WT_HASH_ARRAY_SIZE];
/* Locked: data handle list */
- SLIST_HEAD(__wt_dhandle_lh, __wt_data_handle) dhlh;
+ TAILQ_HEAD(__wt_dhandle_qh, __wt_data_handle) dhqh;
/* Locked: LSM handle list. */
TAILQ_HEAD(__wt_lsm_qh, __wt_lsm_tree) lsmqh;
/* Locked: file list */
- SLIST_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_fh_lh, __wt_fh) fhlh;
+ TAILQ_HEAD(__wt_fhhash, __wt_fh) fhhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_fh_qh, __wt_fh) fhqh;
/* Locked: library list */
TAILQ_HEAD(__wt_dlh_qh, __wt_dlh) dlhqh;
WT_SPINLOCK block_lock; /* Locked: block manager list */
- SLIST_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE];
- SLIST_HEAD(__wt_block_lh, __wt_block) blocklh;
+ TAILQ_HEAD(__wt_blockhash, __wt_block) blockhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_block_qh, __wt_block) blockqh;
+ u_int dhandle_count; /* Locked: handles in the queue */
u_int open_btree_count; /* Locked: open writable btree count */
uint32_t next_file_id; /* Locked: file ID counter */
uint32_t open_file_count; /* Atomic: open file handle count */
+ uint32_t open_cursor_count; /* Atomic: open cursor handle count */
/*
* WiredTiger allocates space for 50 simultaneous sessions (threads of
@@ -262,7 +270,9 @@ struct __wt_connection_impl {
uint32_t hazard_max; /* Hazard array size */
WT_CACHE *cache; /* Page cache */
- uint64_t cache_size; /* Configured cache size */
+ volatile uint64_t cache_size; /* Cache size (either statically
+ configured or the current size
+ within a cache pool). */
WT_TXN_GLOBAL txn_global; /* Global transaction state */
@@ -277,9 +287,12 @@ struct __wt_connection_impl {
#define WT_CKPT_LOGSIZE(conn) ((conn)->ckpt_logsize != 0)
wt_off_t ckpt_logsize; /* Checkpoint log size period */
uint32_t ckpt_signalled;/* Checkpoint signalled */
- uint64_t ckpt_usecs; /* Checkpoint period */
- int compact_in_memory_pass; /* Compaction serialization */
+ uint64_t ckpt_usecs; /* Checkpoint timer */
+ uint64_t ckpt_time_max; /* Checkpoint time min/max */
+ uint64_t ckpt_time_min;
+ uint64_t ckpt_time_recent; /* Checkpoint time recent/total */
+ uint64_t ckpt_time_total;
#define WT_CONN_STAT_ALL 0x01 /* "all" statistics configured */
#define WT_CONN_STAT_CLEAR 0x02 /* clear after gathering */
@@ -289,7 +302,9 @@ struct __wt_connection_impl {
#define WT_CONN_STAT_SIZE 0x20 /* "size" statistics configured */
uint32_t stat_flags;
- WT_CONNECTION_STATS stats; /* Connection statistics */
+ /* Connection statistics */
+ WT_CONNECTION_STATS *stats[WT_COUNTER_SLOTS];
+ WT_CONNECTION_STATS stat_array[WT_COUNTER_SLOTS];
WT_ASYNC *async; /* Async structure */
int async_cfg; /* Global async configuration */
@@ -325,7 +340,8 @@ struct __wt_connection_impl {
#define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */
#define WT_CONN_LOG_EXISTED 0x04 /* Log files found */
#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */
-#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */
+#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */
+#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */
uint32_t log_flags; /* Global logging configuration */
WT_CONDVAR *log_cond; /* Log server wait mutex */
WT_SESSION_IMPL *log_session; /* Log server session */
@@ -354,6 +370,20 @@ struct __wt_connection_impl {
time_t sweep_interval;/* Handle sweep interval */
u_int sweep_handles_min;/* Handle sweep minimum open */
+ /*
+ * Shared lookaside lock, session and cursor, used by threads accessing
+ * the lookaside table (other than eviction server and worker threads
+ * and the sweep thread, all of which have their own lookaside cursors).
+ */
+ WT_SPINLOCK las_lock; /* Lookaside table spinlock */
+ WT_SESSION_IMPL *las_session; /* Lookaside table session */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+ bool las_written; /* Lookaside table has been written */
+
+ WT_ITEM las_sweep_key; /* Sweep server's saved key */
+ int las_sweep_call;/* Sweep server's call count */
+ uint64_t las_sweep_cnt; /* Sweep server's per-call row count */
+
/* Locked: collator list */
TAILQ_HEAD(__wt_coll_qh, __wt_named_collator) collqh;
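
A recurring pattern in this header is a structure linked into both a main queue and a hash bucket, with insert/remove macros keeping the two (and now a count, for the sweep server) in sync. The switch from SLIST to TAILQ makes removal O(1) instead of a list walk. A self-contained sketch of that bookkeeping with <sys/queue.h> TAILQs follows; the types, hash function, and bucket count are illustrative only.

#include <sys/queue.h>

#define NBUCKETS	17		/* Prime, per the comment above */

struct handle {
	const char *name;
	TAILQ_ENTRY(handle) q;		/* Main queue linkage */
	TAILQ_ENTRY(handle) hashq;	/* Hash bucket linkage */
};

static TAILQ_HEAD(, handle) mainq = TAILQ_HEAD_INITIALIZER(mainq);
static TAILQ_HEAD(, handle) hashqh[NBUCKETS];
static unsigned int handle_count;

static unsigned int
bucket(const char *name)
{
	unsigned int h;

	for (h = 0; *name != '\0'; ++name)
		h = h * 31 + (unsigned char)*name;
	return (h % NBUCKETS);
}

static void
handles_init(void)
{
	int i;

	for (i = 0; i < NBUCKETS; ++i)
		TAILQ_INIT(&hashqh[i]);
}

/* Insert/remove keep both queues and the count in sync. */
static void
handle_insert(struct handle *hp)
{
	TAILQ_INSERT_HEAD(&mainq, hp, q);
	TAILQ_INSERT_HEAD(&hashqh[bucket(hp->name)], hp, hashq);
	++handle_count;
}

static void
handle_remove(struct handle *hp)
{
	TAILQ_REMOVE(&mainq, hp, q);
	TAILQ_REMOVE(&hashqh[bucket(hp->name)], hp, hashq);
	--handle_count;
}
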
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 36f36f2c46c..2f55dfc8186 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -261,6 +261,7 @@ struct __wt_cursor_index {
WT_CURSOR *child;
WT_CURSOR **cg_cursors;
+ uint8_t *cg_needvalue;
};
struct __wt_cursor_json {
@@ -303,10 +304,10 @@ struct __wt_cursor_stat {
int notinitialized; /* Cursor not initialized */
int notpositioned; /* Cursor not positioned */
- WT_STATS *stats; /* Stats owned by the cursor */
- WT_STATS *stats_first; /* First stats reference */
- int stats_base; /* Base statistics value */
- int stats_count; /* Count of stats elements */
+ int64_t *stats; /* Statistics */
+ int stats_base; /* Base statistics value */
+ int stats_count; /* Count of statistics values */
+ const char *(*stats_desc)(int); /* Statistics descriptions */
union { /* Copies of the statistics */
WT_DSRC_STATS dsrc_stats;
@@ -325,12 +326,10 @@ struct __wt_cursor_stat {
/*
* WT_CURSOR_STATS --
- * Return a reference to a statistic cursor's stats structures; use the
- * WT_CURSOR.stats_first field instead of WT_CURSOR.stats because the latter
- * is NULL when non-cursor memory is used to hold the statistics.
+ * Return a reference to a statistic cursor's stats structures.
*/
#define WT_CURSOR_STATS(cursor) \
- (((WT_CURSOR_STAT *)cursor)->stats_first)
+ (((WT_CURSOR_STAT *)cursor)->stats)
struct __wt_cursor_table {
WT_CURSOR iface;
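
The statistics cursor now walks a bare array of int64_t counters and asks a callback for each slot's description, instead of carrying a structure of named fields. A toy version of that slot/description split (the statistic names here are hypothetical):

#include <inttypes.h>
#include <stdio.h>

#define STAT_CACHE_HITS		0
#define STAT_CACHE_MISSES	1
#define STAT_COUNT		2

/* Map a statistics slot to its printable description. */
static const char *
stats_desc(int slot)
{
	switch (slot) {
	case STAT_CACHE_HITS:	return ("cache: hits");
	case STAT_CACHE_MISSES:	return ("cache: misses");
	}
	return ("unknown");
}

int
main(void)
{
	int64_t stats[STAT_COUNT] = { 812, 44 };
	int i;

	/* A statistics-cursor walk pairs each value with its description. */
	for (i = 0; i < STAT_COUNT; ++i)
		printf("%s=%" PRId64 "\n", stats_desc(i), stats[i]);
	return (0);
}
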
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 9e592ede450..e7fed250251 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -32,7 +32,7 @@ __cursor_pos_clear(WT_CURSOR_BTREE *cbt)
* and it's a minimal set of things we need to clear. It would be a
* lot simpler to clear everything, but we call this function a lot.
*/
- cbt->recno = 0;
+ cbt->recno = WT_RECNO_OOB;
cbt->ins = NULL;
cbt->ins_head = NULL;
@@ -150,7 +150,7 @@ __wt_cursor_dhandle_incr_use(WT_SESSION_IMPL *session)
dhandle = session->dhandle;
/* If we open a handle with a time of death set, clear it. */
- if (WT_ATOMIC_ADD4(dhandle->session_inuse, 1) == 1 &&
+ if (__wt_atomic_addi32(&dhandle->session_inuse, 1) == 1 &&
dhandle->timeofdeath != 0)
dhandle->timeofdeath = 0;
}
@@ -168,7 +168,7 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session)
/* If we close a handle with a time of death set, clear it. */
WT_ASSERT(session, dhandle->session_inuse > 0);
- if (WT_ATOMIC_SUB4(dhandle->session_inuse, 1) == 0 &&
+ if (__wt_atomic_subi32(&dhandle->session_inuse, 1) == 0 &&
dhandle->timeofdeath != 0)
dhandle->timeofdeath = 0;
}
@@ -187,6 +187,12 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
if (reenter)
WT_RET(__curfile_leave(cbt));
+ /*
+ * Any old insert position is now invalid. We rely on this being
+ * cleared to detect if a new skiplist is installed after a search.
+ */
+ cbt->ins_stack[0] = NULL;
+
/* If the transaction is idle, check that the cache isn't full. */
WT_RET(__wt_txn_idle_cache_check(session));
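
The in-use counter updates above rely on the atomic's return value to detect the first-user and last-user transitions, so the sweep server's time-of-death stamp is cleared exactly when a handle comes back into use. A sketch of that pattern using the GCC/clang __sync builtins the new __wt_atomic_* functions wrap (simplified structure, hypothetical names):

#include <stdint.h>
#include <time.h>

struct dhandle {
	int32_t session_inuse;		/* Concurrent users of the handle */
	time_t timeofdeath;		/* When the handle became idle */
};

/* First user in: a handle scheduled for sweeping gets a reprieve. */
static void
dhandle_incr_use(struct dhandle *d)
{
	if (__sync_add_and_fetch(&d->session_inuse, 1) == 1 &&
	    d->timeofdeath != 0)
		d->timeofdeath = 0;
}

/* Last user out: the same check on the way down. */
static void
dhandle_decr_use(struct dhandle *d)
{
	if (__sync_sub_and_fetch(&d->session_inuse, 1) == 0 &&
	    d->timeofdeath != 0)
		d->timeofdeath = 0;
}
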
diff --git a/src/include/dhandle.h b/src/include/dhandle.h
index d41631696b4..9a54b4ddb66 100644
--- a/src/include/dhandle.h
+++ b/src/include/dhandle.h
@@ -28,14 +28,19 @@
*/
#define WT_SAVE_DHANDLE(s, e) WT_WITH_DHANDLE(s, (s)->dhandle, e)
+/* Check if a handle is inactive. */
+#define WT_DHANDLE_INACTIVE(dhandle) \
+ (F_ISSET(dhandle, WT_DHANDLE_DEAD) || \
+ !F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_OPEN))
+
/*
* WT_DATA_HANDLE --
* A handle for a generic named data source.
*/
struct __wt_data_handle {
WT_RWLOCK *rwlock; /* Lock for shared/exclusive ops */
- SLIST_ENTRY(__wt_data_handle) l;
- SLIST_ENTRY(__wt_data_handle) hashl;
+ TAILQ_ENTRY(__wt_data_handle) q;
+ TAILQ_ENTRY(__wt_data_handle) hashq;
/*
* Sessions caching a connection's data handle will have a non-zero
@@ -64,7 +69,9 @@ struct __wt_data_handle {
*/
WT_SPINLOCK close_lock; /* Lock to close the handle */
- WT_DSRC_STATS stats; /* Data-source statistics */
+ /* Data-source statistics */
+ WT_DSRC_STATS *stats[WT_COUNTER_SLOTS];
+ WT_DSRC_STATS stat_array[WT_COUNTER_SLOTS];
/* Flags values over 0xff are reserved for WT_BTREE_* */
#define WT_DHANDLE_DEAD 0x01 /* Dead, awaiting discard */
diff --git a/src/include/error.h b/src/include/error.h
index fcb96b16361..abffc02945e 100644
--- a/src/include/error.h
+++ b/src/include/error.h
@@ -92,7 +92,8 @@
return (__wt_illegal_value(session, NULL))
#define WT_ILLEGAL_VALUE_ERR(session) \
default: \
- WT_ERR(__wt_illegal_value(session, NULL))
+ ret = __wt_illegal_value(session, NULL); \
+ goto err
#define WT_ILLEGAL_VALUE_SET(session) \
default: \
ret = __wt_illegal_value(session, NULL); \
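
WT_ILLEGAL_VALUE_ERR now expands to an explicit assignment plus goto rather than wrapping WT_ERR, keeping the expansion a plain statement sequence in a switch default. The underlying single-exit idiom, sketched standalone (a hypothetical function, not WiredTiger code):

#include <errno.h>

static int
apply_op(int op)
{
	int ret;

	switch (op) {
	case 0:
		/* ... the legal cases ... */
		break;
	default:			/* As the macro expands */
		ret = EINVAL;
		goto err;
	}

	return (0);			/* Success path */

err:	/* Shared cleanup runs for every error exit. */
	return (ret);
}
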
diff --git a/src/include/extern.h b/src/include/extern.h
index f0c1a0e310a..a8f11c8694f 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -63,7 +63,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max);
extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block);
extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block);
-extern int __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
+extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size);
extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, int *eofp);
extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, int valid);
extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]);
@@ -101,8 +101,9 @@ extern int __wt_btcur_next_random(WT_CURSOR_BTREE *cbt);
extern int __wt_btcur_compare(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *cmpp);
extern int __wt_btcur_equals( WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp);
extern int __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop);
+extern void __wt_btcur_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern void __wt_btcur_open(WT_CURSOR_BTREE *cbt);
-extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt);
+extern int __wt_btcur_close(WT_CURSOR_BTREE *cbt, int lowlevel);
extern int __wt_debug_set_verbose(WT_SESSION_IMPL *session, const char *v);
extern int __wt_debug_addr_print( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
extern int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile);
@@ -115,12 +116,13 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char *
extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
-extern int __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
+extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref);
extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_free_ref( WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, int free_pages);
extern void __wt_free_ref_index(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX *pindex, int free_pages);
+extern void __wt_free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd);
extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, int is_recno);
@@ -138,15 +140,15 @@ extern const char *__wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *add
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
+extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size);
extern int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
, const char *file, int line
#endif
);
-extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep);
-extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
-extern int __wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd);
extern int __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]);
extern void __wt_split_stash_discard(WT_SESSION_IMPL *session);
@@ -159,10 +161,10 @@ extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst);
extern int __wt_cache_op(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, int op);
extern int __wt_upgrade(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]);
-extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok);
-extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf);
+extern int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok);
+extern int __wt_verify_dsk(WT_SESSION_IMPL *session, const char *tag, WT_ITEM *buf);
extern int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags);
-extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove);
+extern int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, int is_remove);
extern int __wt_col_search(WT_SESSION_IMPL *session, uint64_t recno, WT_REF *leaf, WT_CURSOR_BTREE *cbt);
extern int __wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_row_leaf_key_copy( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key);
@@ -179,6 +181,14 @@ extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page,
extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key);
extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, int insert);
extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern void __wt_las_stats_update(WT_SESSION_IMPL *session);
+extern int __wt_las_create(WT_SESSION_IMPL *session);
+extern int __wt_las_destroy(WT_SESSION_IMPL *session);
+extern void __wt_las_set_written(WT_SESSION_IMPL *session);
+extern bool __wt_las_is_written(WT_SESSION_IMPL *session);
+extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
+extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
+extern int __wt_las_sweep(WT_SESSION_IMPL *session);
extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
@@ -237,7 +247,7 @@ extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session);
extern int __wt_connection_init(WT_CONNECTION_IMPL *conn);
extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]);
-extern int __wt_log_wrlsn(WT_SESSION_IMPL *session, uint32_t *free_i, int *yield);
+extern int __wt_log_wrlsn(WT_SESSION_IMPL *session);
extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_logmgr_open(WT_SESSION_IMPL *session);
extern int __wt_logmgr_destroy(WT_SESSION_IMPL *session);
@@ -309,14 +319,14 @@ extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_server_wake(WT_SESSION_IMPL *session);
extern int __wt_evict_create(WT_SESSION_IMPL *session);
extern int __wt_evict_destroy(WT_SESSION_IMPL *session);
-extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_REF *ref);
extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, int *evict_resetp);
extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session);
-extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, int pct_full);
-extern void __wt_cache_dump(WT_SESSION_IMPL *session);
+extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full);
+extern int __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile);
extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, int closing);
extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
+extern int __wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
extern int __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn);
extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, int *rec);
@@ -324,12 +334,14 @@ extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, int active_only);
extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
+extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot);
extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, int prealloc);
extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum);
extern int __wt_log_open(WT_SESSION_IMPL *session);
extern int __wt_log_close(WT_SESSION_IMPL *session);
-extern int __wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created);
+extern int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep);
extern int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie);
+extern int __wt_log_force_write(WT_SESSION_IMPL *session, int retry);
extern int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags);
extern int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap);
extern int __wt_logrec_alloc(WT_SESSION_IMPL *session, size_t size, WT_ITEM **logrecp);
@@ -355,14 +367,15 @@ extern int __wt_logop_row_truncate_pack( WT_SESSION_IMPL *session, WT_ITEM *logr
extern int __wt_logop_row_truncate_unpack( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, uint32_t *fileidp, WT_ITEM *startp, WT_ITEM *stopp, uint32_t *modep);
extern int __wt_logop_row_truncate_print( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
extern int __wt_txn_op_printlog( WT_SESSION_IMPL *session, const uint8_t **pp, const uint8_t *end, FILE *out);
+extern void __wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern int __wt_log_slot_close( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced);
+extern int __wt_log_slot_switch( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced);
+extern int __wt_log_slot_new(WT_SESSION_IMPL *session);
extern int __wt_log_slot_init(WT_SESSION_IMPL *session);
extern int __wt_log_slot_destroy(WT_SESSION_IMPL *session);
-extern int __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslotp);
-extern int __wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int __wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
-extern int64_t __wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size);
-extern int __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
+extern void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot);
+extern int64_t __wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size);
+extern void __wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot);
extern int __wt_clsm_request_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_await_switch(WT_CURSOR_LSM *clsm);
extern int __wt_clsm_init_merge( WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks);
@@ -475,7 +488,7 @@ extern int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t siz
extern int __wt_mmap_discard(WT_SESSION_IMPL *session, void *p, size_t size);
extern int __wt_munmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
extern int __wt_cond_alloc(WT_SESSION_IMPL *session, const char *name, int is_signalled, WT_CONDVAR **condp);
-extern int __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs);
+extern int __wt_cond_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled);
extern int __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp);
extern int __wt_rwlock_alloc( WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name);
@@ -489,7 +502,7 @@ extern int __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
extern int __wt_once(void (*init_routine)(void));
extern int __wt_open(WT_SESSION_IMPL *session, const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp);
extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp);
-extern int __wt_absolute_path(const char *path);
+extern bool __wt_absolute_path(const char *path);
extern const char *__wt_path_separator(void);
extern int __wt_has_priv(void);
extern int __wt_remove(WT_SESSION_IMPL *session, const char *name);
@@ -577,6 +590,8 @@ extern int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*f
extern int __wt_session_reset_cursors(WT_SESSION_IMPL *session, int free_buffers);
extern int __wt_session_copy_values(WT_SESSION_IMPL *session);
extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config);
+extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]);
extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, int uses_dhandles, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, int open_metadata, WT_SESSION_IMPL **sessionp);
extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, int *skip);
@@ -639,7 +654,7 @@ extern int __wt_huffman_decode(WT_SESSION_IMPL *session, void *huffman_arg, cons
extern uint32_t __wt_nlpo2_round(uint32_t v);
extern uint32_t __wt_nlpo2(uint32_t v);
extern uint32_t __wt_log2_int(uint32_t n);
-extern int __wt_ispo2(uint32_t v);
+extern bool __wt_ispo2(uint32_t v);
extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2);
extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state);
extern uint32_t __wt_random(WT_RAND_STATE volatile *rnd_state);
@@ -655,11 +670,19 @@ __wt_scr_alloc_func(WT_SESSION_IMPL *session, size_t size, WT_ITEM **scratchp
extern void __wt_scr_discard(WT_SESSION_IMPL *session);
extern void *__wt_ext_scr_alloc( WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, size_t size);
extern void __wt_ext_scr_free(WT_EXTENSION_API *wt_api, WT_SESSION *wt_session, void *p);
-extern void __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats);
-extern void __wt_stat_refresh_dsrc_stats(void *stats_arg);
-extern void __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent);
-extern void __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats);
-extern void __wt_stat_refresh_connection_stats(void *stats_arg);
+extern const char *__wt_stat_dsrc_desc(int slot);
+extern void __wt_stat_dsrc_init_single(WT_DSRC_STATS *stats);
+extern void __wt_stat_dsrc_init(WT_DATA_HANDLE *handle);
+extern void __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats);
+extern void __wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats);
+extern void __wt_stat_dsrc_aggregate_single( WT_DSRC_STATS *from, WT_DSRC_STATS *to);
+extern void __wt_stat_dsrc_aggregate( WT_DSRC_STATS **from, WT_DSRC_STATS *to);
+extern const char *__wt_stat_connection_desc(int slot);
+extern void __wt_stat_connection_init_single(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_connection_init(WT_CONNECTION_IMPL *handle);
+extern void __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats);
+extern void __wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats);
+extern void __wt_stat_connection_aggregate( WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force);
diff --git a/src/include/flags.h b/src/include/flags.h
index 675ede9a8a0..ca3c3c38245 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -18,6 +18,8 @@
#define WT_CONN_SERVER_SWEEP 0x00002000
#define WT_CONN_WAS_BACKUP 0x00004000
#define WT_EVICTING 0x00000001
+#define WT_EVICT_LOOKASIDE 0x00000002
+#define WT_EVICT_UPDATE_RESTORE 0x00000004
#define WT_FILE_TYPE_CHECKPOINT 0x00000001
#define WT_FILE_TYPE_DATA 0x00000002
#define WT_FILE_TYPE_DIRECTORY 0x00000004
@@ -42,27 +44,25 @@
#define WT_READ_WONT_NEED 0x00000100
#define WT_SESSION_CAN_WAIT 0x00000001
#define WT_SESSION_CLEAR_EVICT_WALK 0x00000002
-#define WT_SESSION_DISCARD_FORCE 0x00000004
-#define WT_SESSION_INTERNAL 0x00000008
-#define WT_SESSION_LOCKED_CHECKPOINT 0x00000010
-#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000020
-#define WT_SESSION_LOCKED_SCHEMA 0x00000040
+#define WT_SESSION_INTERNAL 0x00000004
+#define WT_SESSION_LOCKED_CHECKPOINT 0x00000008
+#define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010
+#define WT_SESSION_LOCKED_SCHEMA 0x00000020
+#define WT_SESSION_LOCKED_SLOT 0x00000040
#define WT_SESSION_LOCKED_TABLE 0x00000080
#define WT_SESSION_LOGGING_INMEM 0x00000100
-#define WT_SESSION_NO_CACHE 0x00000200
-#define WT_SESSION_NO_CACHE_CHECK 0x00000400
+#define WT_SESSION_LOOKASIDE_CURSOR 0x00000200
+#define WT_SESSION_NO_CACHE 0x00000400
#define WT_SESSION_NO_DATA_HANDLES 0x00000800
-#define WT_SESSION_NO_LOGGING 0x00001000
-#define WT_SESSION_NO_SCHEMA_LOCK 0x00002000
-#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00004000
-#define WT_SESSION_SERVER_ASYNC 0x00008000
-#define WT_SKIP_UPDATE_ERR 0x00000002
-#define WT_SKIP_UPDATE_RESTORE 0x00000004
+#define WT_SESSION_NO_EVICTION 0x00001000
+#define WT_SESSION_NO_LOGGING 0x00002000
+#define WT_SESSION_NO_SCHEMA_LOCK 0x00004000
+#define WT_SESSION_QUIET_CORRUPT_FILE 0x00008000
+#define WT_SESSION_SERVER_ASYNC 0x00010000
#define WT_SYNC_CHECKPOINT 0x00000001
#define WT_SYNC_CLOSE 0x00000002
#define WT_SYNC_DISCARD 0x00000004
-#define WT_SYNC_DISCARD_FORCE 0x00000008
-#define WT_SYNC_WRITE_LEAVES 0x00000010
+#define WT_SYNC_WRITE_LEAVES 0x00000008
#define WT_TXN_LOG_CKPT_CLEANUP 0x00000001
#define WT_TXN_LOG_CKPT_PREPARE 0x00000002
#define WT_TXN_LOG_CKPT_START 0x00000004
@@ -92,6 +92,7 @@
#define WT_VERB_VERIFY 0x00200000
#define WT_VERB_VERSION 0x00400000
#define WT_VERB_WRITE 0x00800000
+#define WT_VISIBILITY_ERR 0x00000008
/*
* flags section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/gcc.h b/src/include/gcc.h
index 1c61768d372..01e33792d73 100644
--- a/src/include/gcc.h
+++ b/src/include/gcc.h
@@ -85,56 +85,71 @@
* In summary, locking > barriers > volatile.
*
* To avoid locking shared data structures such as statistics and to permit
- * atomic state changes, we rely on the WT_ATOMIC_ADD and WT_ATOMIC_CAS
- * (compare and swap) operations.
+ * atomic state changes, we rely on the atomic-add and atomic-cas (compare and
+ * swap) operations.
*/
-#define __WT_ATOMIC_ADD(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_add_and_fetch(&(v), val))
-#define __WT_ATOMIC_FETCH_ADD(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_fetch_and_add(&(v), val))
+
#ifdef __clang__
/*
- * We avoid __sync_bool_compare_and_swap with due to problems with
- * optimization with some versions of clang. See
- * http://llvm.org/bugs/show_bug.cgi?id=21499 for details.
+ * We avoid __sync_bool_compare_and_swap due to problems with optimization
+ * in some versions of clang. See http://llvm.org/bugs/show_bug.cgi?id=21499
+ * for details.
*/
-#define __WT_ATOMIC_CAS(v, old, new, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_val_compare_and_swap(&(v), old, new) == (old))
+#define WT_ATOMIC_CAS(ptr, oldval, newval) \
+ (__sync_val_compare_and_swap(ptr, oldval, newval) == oldval)
#else
-#define __WT_ATOMIC_CAS(v, old, new, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_bool_compare_and_swap(&(v), old, new))
+#define WT_ATOMIC_CAS(ptr, oldval, newval) \
+ __sync_bool_compare_and_swap(ptr, oldval, newval)
#endif
-#define __WT_ATOMIC_STORE(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- __sync_lock_test_and_set(&(v), val))
-#define __WT_ATOMIC_SUB(v, val, n) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), __sync_sub_and_fetch(&(v), val))
-
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1)
-#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 1)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1)
-
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2)
-#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 2)
-#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new, 2)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8)
-#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 8)
-#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new, 8)
-#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val, 8)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8)
+
+#define WT_ATOMIC_FUNC(name, ret, type) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ return (__sync_add_and_fetch(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ return (__sync_fetch_and_add(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ return (__sync_lock_test_and_set(vp, v)); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ return (__sync_sub_and_fetch(vp, v)); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ return (WT_ATOMIC_CAS(vp, old, new)); \
+}
+
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t)
+WT_ATOMIC_FUNC(size, size_t, size_t)
+
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ return (WT_ATOMIC_CAS((void **)vp, old, new));
+}
/* Compile read-write barrier */
#define WT_BARRIER() __asm__ volatile("" ::: "memory")
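
The gcc.h rewrite trades size-suffixed macros for one inline function family per type: the pointer argument's type now enforces the operand size, so the old WT_STATIC_ASSERT sizing checks disappear. The __sync builtins these functions wrap can be exercised in isolation; this is a sanity check of the builtins (GCC/clang only), not WiredTiger code.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uint32_t v = 10;
	uint64_t w = 0;

	assert(__sync_add_and_fetch(&v, 5) == 15);	/* add, new value */
	assert(__sync_fetch_and_add(&v, 1) == 15);	/* add, old value */
	assert(__sync_sub_and_fetch(&v, 6) == 10);	/* subtract, new value */

	/* Compare-and-swap succeeds only from the expected value. */
	assert(__sync_bool_compare_and_swap(&w, 0, 7));
	assert(!__sync_bool_compare_and_swap(&w, 0, 9));
	assert(w == 7);
	return (0);
}
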
diff --git a/src/include/hardware.h b/src/include/hardware.h
index e3c098826d0..32353072c5b 100644
--- a/src/include/hardware.h
+++ b/src/include/hardware.h
@@ -33,8 +33,8 @@
uint8_t __orig; \
do { \
__orig = (p)->flags_atomic; \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig | (uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
} while (0)
#define F_CAS_ATOMIC(p, mask, ret) do { \
@@ -46,16 +46,30 @@
ret = EBUSY; \
break; \
} \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig | (uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \
+} while (0)
+
+#define F_CAS_ATOMIC_WAIT(p, mask) do { \
+ int __ret; \
+ for (;;) { \
+ F_CAS_ATOMIC(p, mask, __ret); \
+ if (__ret == 0) \
+ break; \
+ __wt_yield(); \
+ } \
} while (0)
#define F_CLR_ATOMIC(p, mask) do { \
uint8_t __orig; \
do { \
__orig = (p)->flags_atomic; \
- } while (!WT_ATOMIC_CAS1((p)->flags_atomic, \
- __orig, __orig & ~(uint8_t)(mask))); \
+ } while (!__wt_atomic_cas8( \
+ &(p)->flags_atomic, __orig, __orig & ~(uint8_t)(mask))); \
} while (0)
#define WT_CACHE_LINE_ALIGNMENT 64 /* Cache line alignment */
+#define WT_CACHE_LINE_ALIGNMENT_VERIFY(session, a) \
+ WT_ASSERT(session, \
+ WT_PTRDIFF(&(a)[1], &(a)[0]) >= WT_CACHE_LINE_ALIGNMENT && \
+ WT_PTRDIFF(&(a)[1], &(a)[0]) % WT_CACHE_LINE_ALIGNMENT == 0)
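
F_SET_ATOMIC and F_CLR_ATOMIC are read-modify-CAS loops over a one-byte flags field, and the new F_CAS_ATOMIC_WAIT simply retries with a yield until the flag wins. A compilable sketch of the set/clear loops using the __sync builtins directly (the structure name is hypothetical):

#include <stdint.h>

struct page {
	uint8_t flags_atomic;
};

static void
flag_set_atomic(struct page *p, uint8_t mask)
{
	uint8_t orig;

	do {		/* Retry until our read-modify-write wins */
		orig = p->flags_atomic;
	} while (!__sync_bool_compare_and_swap(
	    &p->flags_atomic, orig, (uint8_t)(orig | mask)));
}

static void
flag_clr_atomic(struct page *p, uint8_t mask)
{
	uint8_t orig;

	do {
		orig = p->flags_atomic;
	} while (!__sync_bool_compare_and_swap(
	    &p->flags_atomic, orig, (uint8_t)(orig & ~mask)));
}
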
diff --git a/src/include/lint.h b/src/include/lint.h
index 964aa5c118f..f288fb98683 100644
--- a/src/include/lint.h
+++ b/src/include/lint.h
@@ -18,40 +18,71 @@
#define WT_GCC_FUNC_ATTRIBUTE(x)
#define WT_GCC_FUNC_DECL_ATTRIBUTE(x)
-#define __WT_ATOMIC_ADD(v, val) \
- ((v) += (val))
-#define __WT_ATOMIC_FETCH_ADD(v, val) \
- ((v) += (val), (v))
-#define __WT_ATOMIC_CAS(v, old, new) \
- ((v) = ((v) == (old) ? (new) : (old)), (v) == (old))
-#define __WT_ATOMIC_STORE(v, val) \
- ((v) = (val))
-#define __WT_ATOMIC_SUB(v, val) \
- ((v) -= (val), (v))
-
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD1(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD2(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS2(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val)
-#define WT_ATOMIC_FETCH_ADD8(v, val) __WT_ATOMIC_FETCH_ADD(v, val)
-#define WT_ATOMIC_CAS8(v, old, new) __WT_ATOMIC_CAS(v, old, new)
-#define WT_ATOMIC_STORE8(v, val) __WT_ATOMIC_STORE(v, val)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val)
+#define WT_ATOMIC_FUNC(name, ret, type) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ *vp += v; \
+ return (*vp); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ type orig; \
+ \
+	orig = *vp; \
+	*vp += v; \
+	return (orig); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ type orig; \
+ \
+ orig = *vp; \
+ *vp = v; \
+	return (orig); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ *vp -= v; \
+ return (*vp); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ if (*vp == old) { \
+ *vp = new; \
+ return (true); \
+ } \
+ return (false); \
+}
+
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t)
+WT_ATOMIC_FUNC(size, size_t, size_t)
+
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ if (*(void **)vp == old) {
+ *(void **)vp = new;
+ return (true);
+ }
+ return (false);
+}
static inline void WT_BARRIER(void) { return; }
static inline void WT_FULL_BARRIER(void) { return; }
diff --git a/src/include/log.h b/src/include/log.h
index fbb0a3e3842..06be95697c7 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -12,7 +12,6 @@
/* Logging subsystem declarations. */
#define WT_LOG_ALIGN 128
-#define WT_LOG_SLOT_BUF_SIZE 256 * 1024
#define WT_INIT_LSN(l) do { \
(l)->file = 1; \
@@ -48,67 +47,136 @@
((size) - offsetof(WT_LOG_RECORD, record))
/*
- * Compare 2 LSNs, return -1 if lsn0 < lsn1, 0 if lsn0 == lsn1
- * and 1 if lsn0 > lsn1.
- */
-#define WT_LOG_CMP(lsn1, lsn2) \
- ((lsn1)->file != (lsn2)->file ? \
- ((lsn1)->file < (lsn2)->file ? -1 : 1) : \
- ((lsn1)->offset != (lsn2)->offset ? \
- ((lsn1)->offset < (lsn2)->offset ? -1 : 1) : 0))
-
-/*
* Possible values for the consolidation array slot states:
- * (NOTE: Any new states must be > WT_LOG_SLOT_DONE and < WT_LOG_SLOT_READY.)
*
- * < WT_LOG_SLOT_DONE - threads are actively writing to the log.
- * WT_LOG_SLOT_DONE - all activity on this slot is complete.
+ * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
* WT_LOG_SLOT_FREE - slot is available for allocation.
- * WT_LOG_SLOT_PENDING - slot is transitioning from ready to active.
* WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
- * WT_LOG_SLOT_READY - slot is ready for threads to join.
- * > WT_LOG_SLOT_READY - threads are actively consolidating on this slot.
*
* The slot state must be volatile: threads loop checking the state and can't
* cache the first value they see.
+ *
+ * The slot state is divided into two 32-bit halves: one half is the
+ * amount joined and the other is the amount released. Since we use a
+ * few special states, we reserve the top few bits for state, which
+ * makes the maximum size less than 32 bits for both joined and released.
+ */
+
+/*
+ * The high bit is reserved for the special states. If the high bit is
+ * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state.
+ */
+#define WT_LOG_SLOT_FREE -1 /* Not in use */
+#define WT_LOG_SLOT_WRITTEN -2 /* Slot data written, not processed */
+
+/*
+ * We allocate the full buffer size, but trigger a slot switch when we
+ * cross half the buffer size. If a record is larger than the buffer
+ * maximum then we trigger a slot switch and write that record unbuffered.
+ * We use a larger buffer to provide overflow space so that we can switch
+ * once we cross the threshold.
+ */
+#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */
+#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2)
+#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1)
+
+/*
+ * If new slot states are added, adjust WT_LOG_SLOT_BITS and
+ * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32
+ * bits we are using. More slot states here will reduce the maximum
+ * size that a slot can hold unbuffered by half. If a record is
+ * larger than the maximum we can account for in the slot state we fall
+ * back to direct writes.
+ */
+#define WT_LOG_SLOT_BITS 2
+#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS)
+#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL /* Force slot close */
+#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */
+
+/*
+ * Check if the unbuffered flag is set in the joined portion of
+ * the slot state.
*/
-#define WT_LOG_SLOT_DONE 0
-#define WT_LOG_SLOT_FREE 1
-#define WT_LOG_SLOT_PENDING 2
-#define WT_LOG_SLOT_WRITTEN 3
-#define WT_LOG_SLOT_READY 4
-typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
+#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \
+ ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32))
+
+#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL
+#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF)
+#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32)
+
+/*
+ * These macros manipulate the slot state and its component parts.
+ */
+#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON)
+#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32)
+#define WT_LOG_SLOT_JOINED_BUFFERED(state) \
+ (WT_LOG_SLOT_JOINED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1))
+#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s))
+#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state))
+#define WT_LOG_SLOT_RELEASED_BUFFERED(state) \
+ ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \
+ (WT_LOG_SLOT_UNBUFFERED - 1)))
+
+/* Slot is in use */
+#define WT_LOG_SLOT_ACTIVE(state) \
+ (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK)
+/* Slot is in use, but closed to new joins */
+#define WT_LOG_SLOT_CLOSED(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+	    (FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) &&	\
+	    !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED)))
+/* Slot is in use, all data copied into buffer */
+#define WT_LOG_SLOT_INPROGRESS(state) \
+ (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state))
+#define WT_LOG_SLOT_DONE(state) \
+ (WT_LOG_SLOT_CLOSED(state) && \
+ !WT_LOG_SLOT_INPROGRESS(state))
+/* Slot is in use, more threads may join this slot */
+#define WT_LOG_SLOT_OPEN(state) \
+ (WT_LOG_SLOT_ACTIVE(state) && \
+ !WT_LOG_SLOT_UNBUFFERED_ISSET(state) && \
+ !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \
+ WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX)
+
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
volatile int64_t slot_state; /* Slot state */
- uint64_t slot_group_size; /* Group size */
+ int64_t slot_unbuffered; /* Unbuffered data in this slot */
int32_t slot_error; /* Error value */
-#define WT_SLOT_INVALID_INDEX 0xffffffff
- uint32_t slot_index; /* Active slot index */
wt_off_t slot_start_offset; /* Starting file offset */
- WT_LSN slot_release_lsn; /* Slot release LSN */
- WT_LSN slot_start_lsn; /* Slot starting LSN */
- WT_LSN slot_end_lsn; /* Slot ending LSN */
+ wt_off_t slot_last_offset; /* Last record offset */
+ WT_LSN slot_release_lsn; /* Slot release LSN */
+ WT_LSN slot_start_lsn; /* Slot starting LSN */
+ WT_LSN slot_end_lsn; /* Slot ending LSN */
WT_FH *slot_fh; /* File handle for this group */
- WT_ITEM slot_buf; /* Buffer for grouped writes */
- int32_t slot_churn; /* Active slots are scarce. */
+ WT_ITEM slot_buf; /* Buffer for grouped writes */
-#define WT_SLOT_BUFFERED 0x01 /* Buffer writes */
-#define WT_SLOT_CLOSEFH 0x02 /* Close old fh on release */
-#define WT_SLOT_SYNC 0x04 /* Needs sync on release */
-#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */
+#define WT_SLOT_CLOSEFH 0x01 /* Close old fh on release */
+#define WT_SLOT_SYNC 0x02 /* Needs sync on release */
+#define WT_SLOT_SYNC_DIR 0x04 /* Directory sync on release */
uint32_t flags; /* Flags */
-} WT_LOGSLOT;
+};
-#define WT_SLOT_INIT_FLAGS (WT_SLOT_BUFFERED)
+#define WT_SLOT_INIT_FLAGS 0
-typedef struct {
- WT_LOGSLOT *slot;
- wt_off_t offset;
-} WT_MYSLOT;
+#define WT_WITH_SLOT_LOCK(session, log, op) do { \
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \
+ WT_WITH_LOCK(session, \
+ &log->log_slot_lock, WT_SESSION_LOCKED_SLOT, op); \
+} while (0)
+
+struct __wt_myslot {
+ WT_LOGSLOT *slot; /* Slot I'm using */
+ wt_off_t end_offset; /* My end offset in buffer */
+ wt_off_t offset; /* Slot buffer offset */
+#define WT_MYSLOT_CLOSE 0x01 /* This thread is closing the slot */
+#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */
+ uint32_t flags; /* Flags */
+};
- /* Offset of first record */
#define WT_LOG_FIRST_RECORD log->allocsize
-typedef struct {
+struct __wt_log {
uint32_t allocsize; /* Allocation alignment size */
wt_off_t log_written; /* Amount of log written this period */
/*
@@ -119,8 +187,9 @@ typedef struct {
uint32_t tmp_fileid; /* Temporary file number */
uint32_t prep_missed; /* Pre-allocated file misses */
WT_FH *log_fh; /* Logging file handle */
- WT_FH *log_close_fh; /* Logging file handle to close */
WT_FH *log_dir_fh; /* Log directory file handle */
+ WT_FH *log_close_fh; /* Logging file handle to close */
+ WT_LSN log_close_lsn; /* LSN needed to close */
/*
* System LSNs
@@ -141,8 +210,9 @@ typedef struct {
WT_SPINLOCK log_lock; /* Locked: Logging fields */
WT_SPINLOCK log_slot_lock; /* Locked: Consolidation array */
WT_SPINLOCK log_sync_lock; /* Locked: Single-thread fsync */
+ WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */
- WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
+ WT_RWLOCK *log_archive_lock; /* Archive and log cursors */
/* Notify any waiting threads when sync_lsn is updated. */
WT_CONDVAR *log_sync_cond;
@@ -151,22 +221,25 @@ typedef struct {
/*
* Consolidation array information
- * WT_SLOT_ACTIVE must be less than WT_SLOT_POOL.
* Our testing shows that the more consolidation we generate the
	 * better the performance we see, which equates to an active slot
	 * count of one.
+ *
+ * Note: this can't be an array, we impose cache-line alignment and
+ * gcc doesn't support that for arrays.
*/
-#define WT_SLOT_ACTIVE 1
#define WT_SLOT_POOL 128
- WT_LOGSLOT *slot_array[WT_SLOT_ACTIVE]; /* Active slots */
+ WT_LOGSLOT *active_slot; /* Active slot */
WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
size_t slot_buf_size; /* Buffer size for slots */
+#ifdef HAVE_DIAGNOSTIC
+ uint64_t write_calls; /* Calls to log_write */
+#endif
-#define WT_LOG_FORCE_CONSOLIDATE 0x01 /* Disable direct writes */
uint32_t flags;
-} WT_LOG;
+};
-typedef struct {
+struct __wt_log_record {
uint32_t len; /* 00-03: Record length including hdr */
uint32_t checksum; /* 04-07: Checksum of the record */
@@ -176,7 +249,7 @@ typedef struct {
uint8_t unused[2]; /* 10-11: Padding */
uint32_t mem_len; /* 12-15: Uncompressed len if needed */
uint8_t record[0]; /* Beginning of actual data */
-} WT_LOG_RECORD;
+};
/*
* WT_LOG_DESC --
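
The new slot state packs two counters into a single 64-bit word updated atomically: bytes joined in the upper half, bytes released in the lower, with the top two bits reserved for the CLOSE and RESERVED states. The packing arithmetic can be checked in isolation; the harness below mirrors the JOINED/RELEASED/JOIN_REL macros under simplified names and is not the logging code itself.

#include <assert.h>
#include <stdint.h>

#define SLOT_MASK_OFF		0x3fffffffffffffffLL
#define SLOT_JOINED(s)		((((int64_t)(s)) & SLOT_MASK_OFF) >> 32)
#define SLOT_RELEASED(s)	((int64_t)(int32_t)(s))
#define SLOT_JOIN_REL(j, r, s)	(((j) << 32) + (r) + (s))

int
main(void)
{
	int64_t state = 0;

	/* Two writers join 100 and 50 bytes of log records. */
	state = SLOT_JOIN_REL((int64_t)100, 0, state);
	state = SLOT_JOIN_REL((int64_t)50, 0, state);
	assert(SLOT_JOINED(state) == 150);
	assert(SLOT_RELEASED(state) == 0);	/* Copies still outstanding */

	/* Both writers finish copying and release their bytes. */
	state = SLOT_JOIN_REL((int64_t)0, 150, state);
	assert(SLOT_JOINED(state) == SLOT_RELEASED(state));

	/* joined == released: all data copied, slot can be processed. */
	return (0);
}
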
diff --git a/src/include/log.i b/src/include/log.i
new file mode 100644
index 00000000000..ff309c31265
--- /dev/null
+++ b/src/include/log.i
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+static inline int __wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2);
+
+/*
+ * __wt_log_cmp --
+ * Compare 2 LSNs, return -1 if lsn1 < lsn2, 0 if lsn1 == lsn2
+ * and 1 if lsn1 > lsn2.
+ */
+static inline int
+__wt_log_cmp(WT_LSN *lsn1, WT_LSN *lsn2)
+{
+ WT_LSN l1, l2;
+
+ /*
+ * Read LSNs into local variables so that we only read each field
+ * once and all comparisons are on the same values.
+ */
+ l1 = *(volatile WT_LSN *)lsn1;
+ l2 = *(volatile WT_LSN *)lsn2;
+
+ /*
+ * If the file numbers are different we don't need to compare the
+ * offset.
+ */
+ if (l1.file != l2.file)
+ return (l1.file < l2.file ? -1 : 1);
+ /*
+ * If the file numbers are the same, compare the offset.
+ */
+ if (l1.offset != l2.offset)
+ return (l1.offset < l2.offset ? -1 : 1);
+ return (0);
+}
diff --git a/src/include/lsm.h b/src/include/lsm.h
index 08e57794fb8..11cf8204aec 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -74,7 +74,7 @@ struct __wt_cursor_lsm {
* WT_LSM_CHUNK --
* A single chunk (file) in an LSM tree.
*/
-struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_lsm_chunk {
+struct __wt_lsm_chunk {
const char *uri; /* Data source for this chunk */
const char *bloom_uri; /* URI of Bloom filter, if any */
struct timespec create_ts; /* Creation time (for rate limiting) */
@@ -177,16 +177,14 @@ struct __wt_lsm_tree {
const char *collator_name;
int collator_owned;
- int refcnt; /* Number of users of the tree */
- int8_t exclusive; /* Tree is locked exclusively */
+ uint32_t refcnt; /* Number of users of the tree */
+ uint8_t exclusive; /* Tree is locked exclusively */
#define LSM_TREE_MAX_QUEUE 100
- int queue_ref;
+ uint32_t queue_ref;
WT_RWLOCK *rwlock;
TAILQ_ENTRY(__wt_lsm_tree) q;
- WT_DSRC_STATS stats; /* LSM-level statistics */
-
uint64_t dsk_gen;
uint64_t ckpt_throttle; /* Rate limiting due to checkpoints */
@@ -221,9 +219,28 @@ struct __wt_lsm_tree {
WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
size_t old_alloc; /* Space allocated for old chunks */
u_int nold_chunks; /* Number of old chunks */
- int freeing_old_chunks; /* Whether chunks are being freed */
+ uint32_t freeing_old_chunks; /* Whether chunks are being freed */
uint32_t merge_aggressiveness; /* Increase amount of work per merge */
+ /*
+ * We maintain a set of statistics outside of the normal statistics
+ * area, copying them into place when a statistics cursor is created.
+ */
+#define WT_LSM_TREE_STAT_INCR(session, fld) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ ++(fld); \
+} while (0)
+#define WT_LSM_TREE_STAT_INCRV(session, fld, v) do { \
+ if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
+ (fld) += (int64_t)(v); \
+} while (0)
+ int64_t bloom_false_positive;
+ int64_t bloom_hit;
+ int64_t bloom_miss;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_lookup_no_bloom;
+ int64_t lsm_merge_throttle;
+
#define WT_LSM_TREE_ACTIVE 0x01 /* Workers are active */
#define WT_LSM_TREE_AGGRESSIVE_TIMER 0x02 /* Timer for merge aggression */
#define WT_LSM_TREE_COMPACTING 0x04 /* Tree being compacted */
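
Keeping the hot LSM counters outside the shared statistics array avoids contention on every Bloom-filter probe; the values are only folded into the regular statistics when a cursor asks for them. A sketch of the guarded-increment pattern (the flag value and names are illustrative):

#include <stdint.h>

#define STAT_FAST	0x04		/* "fast" statistics configured */
static uint32_t stat_flags;

/* Increment a tree-local counter only when fast stats are on. */
#define TREE_STAT_INCR(fld) do {					\
	if ((stat_flags & STAT_FAST) != 0)				\
		++(fld);						\
} while (0)

struct lsm_tree {
	int64_t bloom_hit;
	int64_t bloom_miss;
};

static void
bloom_lookup_done(struct lsm_tree *t, int hit)
{
	if (hit)
		TREE_STAT_INCR(t->bloom_hit);
	else
		TREE_STAT_INCR(t->bloom_miss);
}
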
diff --git a/src/include/meta.h b/src/include/meta.h
index 66547262417..a5a303f1630 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -21,7 +21,9 @@
#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
#define WT_METADATA_URI "metadata:" /* Metadata alias */
-#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata file URI */
+#define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */
+
+#define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI */
/*
 * Precomputed hash for the metadata file. Used to optimize comparisons
diff --git a/src/include/misc.h b/src/include/misc.h
index 7fb6ae13d38..1b2cbf11fc2 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -130,6 +130,7 @@
#define FLD_CLR(field, mask) ((field) &= ~((uint32_t)(mask)))
#define FLD_ISSET(field, mask) ((field) & ((uint32_t)(mask)))
+#define FLD64_ISSET(field, mask) ((field) & ((uint64_t)(mask)))
#define FLD_SET(field, mask) ((field) |= ((uint32_t)(mask)))
/*
diff --git a/src/include/misc.i b/src/include/misc.i
index 98facff02b9..6b502c4c1d1 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -7,6 +7,18 @@
*/
/*
+ * __wt_cond_wait --
+ * Wait on a mutex, optionally timing out.
+ */
+static inline int
+__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+{
+ int notused;
+
+ return (__wt_cond_wait_signal(session, cond, usecs, &notused));
+}
+
+/*
* __wt_strdup --
* ANSI strdup function.
*/
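A sketch of a caller of the new wrapper above; it waits with a timeout and ignores whether the wakeup came from a signal or the timeout. The function and condition-variable names are hypothetical; the WT types are those from the surrounding headers:

/*
 * Illustrative only: wait up to 10 milliseconds for work to arrive; any
 * error from the underlying wait is returned to the caller.
 */
static int
example_wait_for_work(WT_SESSION_IMPL *session, WT_CONDVAR *work_cond)
{
	return (__wt_cond_wait(session, work_cond, 10000));
}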
diff --git a/src/include/msvc.h b/src/include/msvc.h
index bc72ddf8193..8f5aa9abde8 100644
--- a/src/include/msvc.h
+++ b/src/include/msvc.h
@@ -31,52 +31,56 @@
#define WT_GCC_FUNC_ATTRIBUTE(x)
#define WT_GCC_FUNC_DECL_ATTRIBUTE(x)
-#define __WT_ATOMIC_ADD(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)) + (val))
-#define __WT_ATOMIC_FETCH_ADD(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), (t)(val)))
-#define __WT_ATOMIC_CAS(v, old, new, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedCompareExchange ## s \
- ((t*)&(v), (t)(new), (t)(old)) == (t)(old))
-#define __WT_ATOMIC_STORE(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchange ## s((t*)&(v), (t)(val)))
-#define __WT_ATOMIC_SUB(v, val, n, s, t) \
- (WT_STATIC_ASSERT(sizeof(v) == (n)), \
- _InterlockedExchangeAdd ## s((t*)&(v), -(t) val) - (val))
+#define WT_ATOMIC_FUNC(name, ret, type, s, t) \
+static inline ret \
+__wt_atomic_add##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v)) + (v)); \
+} \
+static inline ret \
+__wt_atomic_fetch_add##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), (t)(v))); \
+} \
+static inline ret \
+__wt_atomic_store##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchange ## s((t *)(vp), (t)(v))); \
+} \
+static inline ret \
+__wt_atomic_sub##name(type *vp, type v) \
+{ \
+ return (_InterlockedExchangeAdd ## s((t *)(vp), - (t)v) - (v)); \
+} \
+static inline bool \
+__wt_atomic_cas##name(type *vp, type old, type new) \
+{ \
+ return (_InterlockedCompareExchange ## s \
+ ((t *)(vp), (t)(new), (t)(old)) == (t)(old)); \
+}
-#define WT_ATOMIC_ADD1(v, val) __WT_ATOMIC_ADD(v, val, 1, 8, char)
-#define WT_ATOMIC_FETCH_ADD1(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 1, 8, char)
-#define WT_ATOMIC_CAS1(v, old, new) __WT_ATOMIC_CAS(v, old, new, 1, 8, char)
-#define WT_ATOMIC_STORE1(v, val) __WT_ATOMIC_STORE(v, val, 1, 8, char)
-#define WT_ATOMIC_SUB1(v, val) __WT_ATOMIC_SUB(v, val, 1, 8, char)
+WT_ATOMIC_FUNC(8, uint8_t, uint8_t, 8, char)
+WT_ATOMIC_FUNC(16, uint16_t, uint16_t, 16, short)
+WT_ATOMIC_FUNC(32, uint32_t, uint32_t, , long)
+WT_ATOMIC_FUNC(v32, uint32_t, volatile uint32_t, , long)
+WT_ATOMIC_FUNC(i32, int32_t, int32_t, , long)
+WT_ATOMIC_FUNC(iv32, int32_t, volatile int32_t, , long)
+WT_ATOMIC_FUNC(64, uint64_t, uint64_t, 64, __int64)
+WT_ATOMIC_FUNC(v64, uint64_t, volatile uint64_t, 64, __int64)
+WT_ATOMIC_FUNC(i64, int64_t, int64_t, 64, __int64)
+WT_ATOMIC_FUNC(iv64, int64_t, volatile int64_t, 64, __int64)
+WT_ATOMIC_FUNC(size, size_t, size_t, 64, __int64)
-#define WT_ATOMIC_ADD2(v, val) __WT_ATOMIC_ADD(v, val, 2, 16, short)
-#define WT_ATOMIC_FETCH_ADD2(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 2, 16, short)
-#define WT_ATOMIC_CAS2(v, old, new) \
- __WT_ATOMIC_CAS(v, old, new, 2, 16, short)
-#define WT_ATOMIC_STORE2(v, val) __WT_ATOMIC_STORE(v, val, 2, 16, short)
-#define WT_ATOMIC_SUB2(v, val) __WT_ATOMIC_SUB(v, val, 2, 16, short)
-
-#define WT_ATOMIC_ADD4(v, val) __WT_ATOMIC_ADD(v, val, 4, , long)
-#define WT_ATOMIC_FETCH_ADD4(v, val) __WT_ATOMIC_FETCH_ADD(v, val, 4, , long)
-#define WT_ATOMIC_CAS4(v, old, new) __WT_ATOMIC_CAS(v, old, new, 4, , long)
-#define WT_ATOMIC_STORE4(v, val) __WT_ATOMIC_STORE(v, val, 4, , long)
-#define WT_ATOMIC_SUB4(v, val) __WT_ATOMIC_SUB(v, val, 4, , long)
-
-#define WT_ATOMIC_ADD8(v, val) __WT_ATOMIC_ADD(v, val, 8, 64, __int64)
-#define WT_ATOMIC_FETCH_ADD8(v, val) \
- __WT_ATOMIC_FETCH_ADD(v, val, 8, 64, __int64)
-#define WT_ATOMIC_CAS8(v, old, new) \
- __WT_ATOMIC_CAS(v, old, new, 8, 64, __int64)
-#define WT_ATOMIC_STORE8(v, val) \
- __WT_ATOMIC_STORE(v, val, 8, 64, __int64)
-#define WT_ATOMIC_SUB8(v, val) __WT_ATOMIC_SUB(v, val, 8, 64, __int64)
+/*
+ * __wt_atomic_cas_ptr --
+ * Pointer compare and swap.
+ */
+static inline bool
+__wt_atomic_cas_ptr(void *vp, void *old, void *new)
+{
+ return (_InterlockedCompareExchange64(
+ vp, (int64_t)new, (int64_t)old) == ((int64_t)old));
+}
static inline void WT_BARRIER(void) { _ReadWriteBarrier(); }
static inline void WT_FULL_BARRIER(void) { _mm_mfence(); }
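The rewrite above replaces the per-size macros with a generator macro that stamps out typed inline functions, so callers get ordinary C type checking rather than a size assertion. A minimal standalone sketch of the same generator pattern, using the GCC/Clang __sync builtins in place of the _Interlocked intrinsics (the ex_* names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Stamp out typed atomic helpers, mirroring the WT_ATOMIC_FUNC pattern. */
#define EX_ATOMIC_FUNC(name, type)					\
static inline type							\
ex_atomic_add##name(type *vp, type v)					\
{									\
	return (__sync_add_and_fetch(vp, v));				\
}									\
static inline bool							\
ex_atomic_cas##name(type *vp, type old, type new)			\
{									\
	return (__sync_bool_compare_and_swap(vp, old, new));		\
}

EX_ATOMIC_FUNC(32, uint32_t)
EX_ATOMIC_FUNC(64, uint64_t)

With this shape, passing a pointer of the wrong width to ex_atomic_add32 is a compile-time diagnostic, where an untyped macro would silently cast.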
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 7a5028d6a28..1f1bb8f4b5c 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -24,24 +24,20 @@ struct __wt_condvar {
/*
* !!!
- * Don't touch this structure without understanding the read/write
- * locking functions.
+ * Don't modify this structure without understanding the read/write locking
+ * functions.
*/
-typedef union { /* Read/write lock */
-#ifdef WORDS_BIGENDIAN
- WiredTiger read/write locks require modification for big-endian systems.
-#else
+typedef union { /* Read/write lock */
uint64_t u;
struct {
- uint32_t us;
+ uint32_t wr; /* Writers and readers */
} i;
struct {
- uint16_t writers;
- uint16_t readers;
- uint16_t users;
- uint16_t pad;
+ uint16_t writers; /* Now serving for writers */
+ uint16_t readers; /* Now serving for readers */
+ uint16_t users; /* Next available ticket number */
+ uint16_t __notused; /* Padding */
} s;
-#endif
} wt_rwlock_t;
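The union above is a ticket lock: users hands out tickets, and writers/readers are the "now serving" counters. A simplified standalone sketch of read-lock acquisition under that discipline (illustrative only: no memory barriers, pause instructions, or back-off):

#include <stdint.h>

typedef union {
	uint64_t u;
	struct {
		volatile uint16_t writers;	/* Now serving for writers */
		volatile uint16_t readers;	/* Now serving for readers */
		volatile uint16_t users;	/* Next available ticket */
		uint16_t __notused;		/* Padding */
	} s;
} ex_rwlock_t;

static void
ex_readlock(ex_rwlock_t *l)
{
	uint16_t ticket;

	/* Atomically take the next ticket number. */
	ticket = __sync_fetch_and_add(&l->s.users, 1);

	/* Spin until the read "now serving" counter reaches our ticket. */
	while (ticket != l->s.readers)
		;

	/*
	 * Only the thread whose ticket is being served reaches this store,
	 * so a plain increment is safe; it admits the next waiting reader
	 * (readers share the lock).
	 */
	++l->s.readers;
}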
/*
@@ -69,20 +65,21 @@ struct __wt_rwlock {
#if SPINLOCK_TYPE == SPINLOCK_GCC
-typedef volatile int WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT)
- WT_SPINLOCK;
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock {
+ volatile int lock;
+};
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX_ADAPTIVE ||\
SPINLOCK_TYPE == SPINLOCK_MSVC
-typedef WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) struct {
+struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_spinlock {
wt_mutex_t lock;
const char *name; /* Statistics: mutex name */
int8_t initialized; /* Lock initialized, for cleanup */
-} WT_SPINLOCK;
+};
#else
diff --git a/src/include/mutex.i b/src/include/mutex.i
index 8bca50635e6..5ea4583a2ab 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -31,7 +31,7 @@ __wt_spin_init(WT_SESSION_IMPL *session, WT_SPINLOCK *t, const char *name)
WT_UNUSED(session);
WT_UNUSED(name);
- *(t) = 0;
+ t->lock = 0;
return (0);
}
@@ -44,7 +44,7 @@ __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- *(t) = 0;
+ t->lock = 0;
}
/*
@@ -56,7 +56,7 @@ __wt_spin_trylock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- return (__sync_lock_test_and_set(t, 1) == 0 ? 0 : EBUSY);
+ return (__sync_lock_test_and_set(&t->lock, 1) == 0 ? 0 : EBUSY);
}
/*
@@ -70,10 +70,10 @@ __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
WT_UNUSED(session);
- while (__sync_lock_test_and_set(t, 1)) {
- for (i = 0; *t && i < WT_SPIN_COUNT; i++)
+ while (__sync_lock_test_and_set(&t->lock, 1)) {
+ for (i = 0; t->lock && i < WT_SPIN_COUNT; i++)
WT_PAUSE();
- if (*t)
+ if (t->lock)
__wt_yield();
}
}
@@ -87,7 +87,7 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
{
WT_UNUSED(session);
- __sync_lock_release(t);
+ __sync_lock_release(&t->lock);
}
#elif SPINLOCK_TYPE == SPINLOCK_PTHREAD_MUTEX ||\
diff --git a/src/include/os.h b/src/include/os.h
index ba5d95657d5..518b124f547 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -56,7 +56,7 @@ typedef enum {
case EMFILE: \
case ENFILE: \
case ENOSPC: \
- __wt_sleep(0L, 500000L); \
+ __wt_sleep(0L, 50000L); \
continue; \
default: \
break; \
@@ -77,8 +77,8 @@ typedef enum {
struct __wt_fh {
char *name; /* File name */
uint64_t name_hash; /* Hash of name */
- SLIST_ENTRY(__wt_fh) l; /* List of open handles */
- SLIST_ENTRY(__wt_fh) hashl; /* Hashed list of handles */
+ TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
+ TAILQ_ENTRY(__wt_fh) hashq; /* Hashed list of handles */
u_int ref; /* Reference count */
diff --git a/src/include/queue.h b/src/include/queue.h
index 42e736e7b09..1d494875cf6 100644
--- a/src/include/queue.h
+++ b/src/include/queue.h
@@ -38,71 +38,17 @@ extern "C" {
#endif
/*
- * This file defines four types of data structures: singly-linked lists,
- * singly-linked tail queues, lists and tail queues.
+ * WiredTiger only uses the TAILQ macros (we've gotten into trouble in the past
+ * by trying to use simpler queues: a list we didn't think would ever get large
+ * could, under some workloads, become large, and the linear cost of removing
+ * elements with the simpler macros proved to be more trouble than the memory
+ * savings were worth).
*
- * A singly-linked list is headed by a single forward pointer. The elements
- * are singly linked for minimum space and pointer manipulation overhead at
- * the expense of O(n) removal for arbitrary elements. New elements can be
- * added to the list after an existing element or at the head of the list.
- * Elements being removed from the head of the list should use the explicit
- * macro for this purpose for optimum efficiency. A singly-linked list may
- * only be traversed in the forward direction. Singly-linked lists are ideal
- * for applications with large datasets and few or no removals or for
- * implementing a LIFO queue.
+ * Additionally, we've altered the TAILQ_INSERT_XXX macros to include a write
+ * barrier, in order to ensure we never insert a partially built structure onto
+ * a list (this is required because the spinlocks we use don't necessarily imply
+ * a write barrier).
*
- * A singly-linked tail queue is headed by a pair of pointers, one to the
- * head of the list and the other to the tail of the list. The elements are
- * singly linked for minimum space and pointer manipulation overhead at the
- * expense of O(n) removal for arbitrary elements. New elements can be added
- * to the list after an existing element, at the head of the list, or at the
- * end of the list. Elements being removed from the head of the tail queue
- * should use the explicit macro for this purpose for optimum efficiency.
- * A singly-linked tail queue may only be traversed in the forward direction.
- * Singly-linked tail queues are ideal for applications with large datasets
- * and few or no removals or for implementing a FIFO queue.
- *
- * A list is headed by a single forward pointer (or an array of forward
- * pointers for a hash table header). The elements are doubly linked
- * so that an arbitrary element can be removed without a need to
- * traverse the list. New elements can be added to the list before
- * or after an existing element or at the head of the list. A list
- * may only be traversed in the forward direction.
- *
- * A tail queue is headed by a pair of pointers, one to the head of the
- * list and the other to the tail of the list. The elements are doubly
- * linked so that an arbitrary element can be removed without a need to
- * traverse the list. New elements can be added to the list before or
- * after an existing element, at the head of the list, or at the end of
- * the list. A tail queue may be traversed in either direction.
- *
- * For details on the use of these macros, see the queue(3) manual page.
- *
- *
- * SLIST LIST STAILQ TAILQ
- * _HEAD + + + +
- * _HEAD_INITIALIZER + + + +
- * _ENTRY + + + +
- * _INIT + + + +
- * _EMPTY + + + +
- * _FIRST + + + +
- * _NEXT + + + +
- * _PREV - - - +
- * _LAST - - + +
- * _FOREACH + + + +
- * _FOREACH_REVERSE - - - +
- * _INSERT_HEAD + + + +
- * _INSERT_BEFORE - + - +
- * _INSERT_AFTER + + + +
- * _INSERT_TAIL - - + +
- * _CONCAT - - + +
- * _REMOVE_HEAD + - + -
- * _REMOVE + + + +
- *
- */
-
-/*
- * XXX
* We #undef all of the macros because there are incompatible versions of this
* file and these macros on various systems. What makes the problem worse is
* they are included and/or defined by system include files which we may have
@@ -111,50 +57,7 @@ extern "C" {
* several of the LIST_XXX macros. Visual C.NET 7.0 also defines some of these
* same macros in Vc7\PlatformSDK\Include\WinNT.h. Make sure we use ours.
*/
-#undef LIST_EMPTY
-#undef LIST_ENTRY
-#undef LIST_FIRST
-#undef LIST_FOREACH
-#undef LIST_HEAD
-#undef LIST_HEAD_INITIALIZER
-#undef LIST_INIT
-#undef LIST_INSERT_AFTER
-#undef LIST_INSERT_BEFORE
-#undef LIST_INSERT_HEAD
-#undef LIST_NEXT
-#undef LIST_REMOVE
-#undef QMD_TRACE_ELEM
-#undef QMD_TRACE_HEAD
-#undef QUEUE_MACRO_DEBUG
-#undef SLIST_EMPTY
-#undef SLIST_ENTRY
-#undef SLIST_FIRST
-#undef SLIST_FOREACH
-#undef SLIST_FOREACH_PREVPTR
-#undef SLIST_HEAD
-#undef SLIST_HEAD_INITIALIZER
-#undef SLIST_INIT
-#undef SLIST_INSERT_AFTER
-#undef SLIST_INSERT_HEAD
-#undef SLIST_NEXT
-#undef SLIST_REMOVE
-#undef SLIST_REMOVE_HEAD
-#undef STAILQ_CONCAT
-#undef STAILQ_EMPTY
-#undef STAILQ_ENTRY
-#undef STAILQ_FIRST
-#undef STAILQ_FOREACH
-#undef STAILQ_HEAD
-#undef STAILQ_HEAD_INITIALIZER
-#undef STAILQ_INIT
-#undef STAILQ_INSERT_AFTER
-#undef STAILQ_INSERT_HEAD
-#undef STAILQ_INSERT_TAIL
-#undef STAILQ_LAST
-#undef STAILQ_NEXT
-#undef STAILQ_REMOVE
-#undef STAILQ_REMOVE_HEAD
-#undef STAILQ_REMOVE_HEAD_UNTIL
+
#undef TAILQ_CONCAT
#undef TAILQ_EMPTY
#undef TAILQ_ENTRY
@@ -210,230 +113,6 @@ struct qm_trace {
#endif /* QUEUE_MACRO_DEBUG */
/*
- * Singly-linked List declarations.
- */
-#define SLIST_HEAD(name, type) \
-struct name { \
- struct type *slh_first; /* first element */ \
-}
-
-#define SLIST_HEAD_INITIALIZER(head) \
- { NULL }
-
-#define SLIST_ENTRY(type) \
-struct { \
- struct type *sle_next; /* next element */ \
-}
-
-/*
- * Singly-linked List functions.
- */
-#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
-
-#define SLIST_FIRST(head) ((head)->slh_first)
-
-#define SLIST_FOREACH(var, head, field) \
- for ((var) = SLIST_FIRST((head)); \
- (var); \
- (var) = SLIST_NEXT((var), field))
-
-#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \
- for ((varp) = &SLIST_FIRST((head)); \
- ((var) = *(varp)) != NULL; \
- (varp) = &SLIST_NEXT((var), field))
-
-#define SLIST_INIT(head) do { \
- SLIST_FIRST((head)) = NULL; \
-} while (0)
-
-#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \
- SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \
- SLIST_NEXT((slistelm), field) = (elm); \
-} while (0)
-
-#define SLIST_INSERT_HEAD(head, elm, field) do { \
- SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \
- SLIST_FIRST((head)) = (elm); \
-} while (0)
-
-#define SLIST_NEXT(elm, field) ((elm)->field.sle_next)
-
-#define SLIST_REMOVE(head, elm, type, field) do { \
- if (SLIST_FIRST((head)) == (elm)) { \
- SLIST_REMOVE_HEAD((head), field); \
- } \
- else { \
- struct type *curelm = SLIST_FIRST((head)); \
- while (SLIST_NEXT(curelm, field) != (elm)) \
- curelm = SLIST_NEXT(curelm, field); \
- SLIST_NEXT(curelm, field) = \
- SLIST_NEXT(SLIST_NEXT(curelm, field), field); \
- } \
-} while (0)
-
-#define SLIST_REMOVE_HEAD(head, field) do { \
- SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \
-} while (0)
-
-/*
- * Singly-linked Tail queue declarations.
- */
-#define STAILQ_HEAD(name, type) \
-struct name { \
- struct type *stqh_first;/* first element */ \
- struct type **stqh_last;/* addr of last next element */ \
-}
-
-#define STAILQ_HEAD_INITIALIZER(head) \
- { NULL, &(head).stqh_first }
-
-#define STAILQ_ENTRY(type) \
-struct { \
- struct type *stqe_next; /* next element */ \
-}
-
-/*
- * Singly-linked Tail queue functions.
- */
-#define STAILQ_CONCAT(head1, head2) do { \
- if (!STAILQ_EMPTY((head2))) { \
- *(head1)->stqh_last = (head2)->stqh_first; \
- (head1)->stqh_last = (head2)->stqh_last; \
- STAILQ_INIT((head2)); \
- } \
-} while (0)
-
-#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL)
-
-#define STAILQ_FIRST(head) ((head)->stqh_first)
-
-#define STAILQ_FOREACH(var, head, field) \
- for ((var) = STAILQ_FIRST((head)); \
- (var); \
- (var) = STAILQ_NEXT((var), field))
-
-#define STAILQ_INIT(head) do { \
- STAILQ_FIRST((head)) = NULL; \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \
- if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
- STAILQ_NEXT((tqelm), field) = (elm); \
-} while (0)
-
-#define STAILQ_INSERT_HEAD(head, elm, field) do { \
- if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
- STAILQ_FIRST((head)) = (elm); \
-} while (0)
-
-#define STAILQ_INSERT_TAIL(head, elm, field) do { \
- STAILQ_NEXT((elm), field) = NULL; \
- *(head)->stqh_last = (elm); \
- (head)->stqh_last = &STAILQ_NEXT((elm), field); \
-} while (0)
-
-#define STAILQ_LAST(head, type, field) \
- (STAILQ_EMPTY((head)) ? \
- NULL : \
- ((struct type *) \
- ((char *)((head)->stqh_last) - __offsetof(struct type, field))))
-
-#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next)
-
-#define STAILQ_REMOVE(head, elm, type, field) do { \
- if (STAILQ_FIRST((head)) == (elm)) { \
- STAILQ_REMOVE_HEAD((head), field); \
- } \
- else { \
- struct type *curelm = STAILQ_FIRST((head)); \
- while (STAILQ_NEXT(curelm, field) != (elm)) \
- curelm = STAILQ_NEXT(curelm, field); \
- if ((STAILQ_NEXT(curelm, field) = \
- STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\
- (head)->stqh_last = &STAILQ_NEXT((curelm), field);\
- } \
-} while (0)
-
-#define STAILQ_REMOVE_HEAD(head, field) do { \
- if ((STAILQ_FIRST((head)) = \
- STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \
- if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \
- (head)->stqh_last = &STAILQ_FIRST((head)); \
-} while (0)
-
-/*
- * List declarations.
- */
-#define LIST_HEAD(name, type) \
-struct name { \
- struct type *lh_first; /* first element */ \
-}
-
-#define LIST_HEAD_INITIALIZER(head) \
- { NULL }
-
-#define LIST_ENTRY(type) \
-struct { \
- struct type *le_next; /* next element */ \
- struct type **le_prev; /* address of previous next element */ \
-}
-
-/*
- * List functions.
- */
-
-#define LIST_EMPTY(head) ((head)->lh_first == NULL)
-
-#define LIST_FIRST(head) ((head)->lh_first)
-
-#define LIST_FOREACH(var, head, field) \
- for ((var) = LIST_FIRST((head)); \
- (var); \
- (var) = LIST_NEXT((var), field))
-
-#define LIST_INIT(head) do { \
- LIST_FIRST((head)) = NULL; \
-} while (0)
-
-#define LIST_INSERT_AFTER(listelm, elm, field) do { \
- if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
- LIST_NEXT((listelm), field)->field.le_prev = \
- &LIST_NEXT((elm), field); \
- LIST_NEXT((listelm), field) = (elm); \
- (elm)->field.le_prev = &LIST_NEXT((listelm), field); \
-} while (0)
-
-#define LIST_INSERT_BEFORE(listelm, elm, field) do { \
- (elm)->field.le_prev = (listelm)->field.le_prev; \
- LIST_NEXT((elm), field) = (listelm); \
- *(listelm)->field.le_prev = (elm); \
- (listelm)->field.le_prev = &LIST_NEXT((elm), field); \
-} while (0)
-
-#define LIST_INSERT_HEAD(head, elm, field) do { \
- if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \
- LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
- LIST_FIRST((head)) = (elm); \
- (elm)->field.le_prev = &LIST_FIRST((head)); \
-} while (0)
-
-#define LIST_NEXT(elm, field) ((elm)->field.le_next)
-
-#define LIST_REMOVE(elm, field) do { \
- if (LIST_NEXT((elm), field) != NULL) \
- LIST_NEXT((elm), field)->field.le_prev = \
- (elm)->field.le_prev; \
- *(elm)->field.le_prev = LIST_NEXT((elm), field); \
-} while (0)
-
-/*
* Tail queue declarations.
*/
#define TAILQ_HEAD(name, type) \
@@ -488,6 +167,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ WT_WRITE_BARRIER(); \
if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
TAILQ_NEXT((elm), field)->field.tqe_prev = \
&TAILQ_NEXT((elm), field); \
@@ -502,6 +182,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ WT_WRITE_BARRIER(); \
(elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
TAILQ_NEXT((elm), field) = (listelm); \
*(listelm)->field.tqe_prev = (elm); \
@@ -511,6 +192,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ WT_WRITE_BARRIER(); \
if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
TAILQ_FIRST((head))->field.tqe_prev = \
&TAILQ_NEXT((elm), field); \
@@ -523,6 +205,7 @@ struct { \
} while (0)
#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ WT_WRITE_BARRIER(); \
TAILQ_NEXT((elm), field) = NULL; \
(elm)->field.tqe_prev = (head)->tqh_last; \
*(head)->tqh_last = (elm); \
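A standalone sketch of the failure mode the added WT_WRITE_BARRIER calls prevent, reduced to a singly-linked list for brevity (the ex_* names are illustrative, and __sync_synchronize stands in for the barrier). WiredTiger places the barrier at the top of each TAILQ_INSERT_XXX macro because the element is fully built before the macro runs:

#include <stdint.h>

struct ex_elem {
	uint64_t payload;		/* Built before publication */
	struct ex_elem *next;
};

/*
 * Without the barrier, the store publishing the element (*headp = e) may
 * become visible before the stores that built it; a lock-free reader
 * following the head pointer could then observe an uninitialized payload.
 */
static void
ex_insert_head(struct ex_elem **headp, struct ex_elem *e, uint64_t v)
{
	e->payload = v;			/* Build the structure... */
	e->next = *headp;
	__sync_synchronize();		/* ...flush it before publishing... */
	*headp = e;			/* ...then make it reachable. */
}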
diff --git a/src/include/schema.h b/src/include/schema.h
index 8f4884281cd..0664af5adba 100644
--- a/src/include/schema.h
+++ b/src/include/schema.h
@@ -62,8 +62,8 @@ struct __wt_table {
WT_INDEX **indices;
size_t idx_alloc;
- SLIST_ENTRY(__wt_table) l;
- SLIST_ENTRY(__wt_table) hashl;
+ TAILQ_ENTRY(__wt_table) q;
+ TAILQ_ENTRY(__wt_table) hashq;
int cg_complete, idx_complete, is_simple;
u_int ncolgroups, nindices, nkey_columns;
diff --git a/src/include/serial.i b/src/include/serial.i
index 9e6b0f7916c..d90b29c2133 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -30,11 +30,11 @@ __page_write_gen_wrapped_check(WT_PAGE *page)
}
/*
- * __insert_serial_func --
- * Worker function to add a WT_INSERT entry to a skiplist.
+ * __insert_simple_func --
+ * Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
static inline int
-__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+__insert_simple_func(WT_SESSION_IMPL *session,
WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
{
u_int i;
@@ -42,31 +42,62 @@ __insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
WT_UNUSED(session);
/*
- * Confirm we are still in the expected position, and no item has been
- * added where our insert belongs. Take extra care at the beginning
- * and end of the list (at each level): retry if we race there.
+ * Update the skiplist elements referencing the new WT_INSERT item.
+	 * If we fail to connect one of the upper levels in the skiplist,
+ * return success: the levels we updated are correct and sufficient.
+ * Even though we don't get the benefit of the memory we allocated,
+ * we can't roll back.
*
- * !!!
- * Note the test for ins_stack[0] == NULL: that's the test for an
- * uninitialized cursor, ins_stack[0] is cleared as part of
- * initializing a cursor for a search.
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+	 * it. Don't pass complex arguments to the macro: some implementations
+ * read the old value multiple times.
*/
for (i = 0; i < skipdepth; i++) {
- if (ins_stack[i] == NULL ||
- *ins_stack[i] != new_ins->next[i])
- return (WT_RESTART);
- if (new_ins->next[i] == NULL &&
- ins_head->tail[i] != NULL &&
- ins_stack[i] != &ins_head->tail[i]->next[i])
- return (WT_RESTART);
+ WT_INSERT *old_ins = *ins_stack[i];
+ if (old_ins != new_ins->next[i] ||
+ !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
+ return (i == 0 ? WT_RESTART : 0);
}
- /* Update the skiplist elements referencing the new WT_INSERT item. */
+ return (0);
+}
+
+/*
+ * __insert_serial_func --
+ * Worker function to add a WT_INSERT entry to a skiplist.
+ */
+static inline int
+__insert_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
+ WT_INSERT ***ins_stack, WT_INSERT *new_ins, u_int skipdepth)
+{
+ u_int i;
+
+ /* The cursor should be positioned. */
+ WT_ASSERT(session, ins_stack[0] != NULL);
+
+ /*
+ * Update the skiplist elements referencing the new WT_INSERT item.
+ *
+ * Confirm we are still in the expected position, and no item has been
+	 * added where our insert belongs. If we fail to connect one of the
+ * upper levels in the skiplist, return success: the levels we updated
+ * are correct and sufficient. Even though we don't get the benefit of
+ * the memory we allocated, we can't roll back.
+ *
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+	 * it. Don't pass complex arguments to the macro: some implementations
+ * read the old value multiple times.
+ */
for (i = 0; i < skipdepth; i++) {
+ WT_INSERT *old_ins = *ins_stack[i];
+ if (old_ins != new_ins->next[i] ||
+ !__wt_atomic_cas_ptr(ins_stack[i], old_ins, new_ins))
+ return (i == 0 ? WT_RESTART : 0);
if (ins_head->tail[i] == NULL ||
ins_stack[i] == &ins_head->tail[i]->next[i])
ins_head->tail[i] = new_ins;
- *ins_stack[i] = new_ins;
}
return (0);
@@ -92,7 +123,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
* If the application didn't specify a record number, allocate a new one
* and set up for an append.
*/
- if ((recno = WT_INSERT_RECNO(new_ins)) == 0) {
+ if ((recno = WT_INSERT_RECNO(new_ins)) == WT_RECNO_OOB) {
recno = WT_INSERT_RECNO(new_ins) = btree->last_recno + 1;
WT_ASSERT(session, WT_SKIP_LAST(ins_head) == NULL ||
recno > WT_INSERT_RECNO(WT_SKIP_LAST(ins_head)));
@@ -128,20 +159,20 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
- /* Clear references to memory we now own. */
- *new_insp = NULL;
-
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
+ /* Clear references to memory we now own and must free on error. */
+ *new_insp = NULL;
+
/* Acquire the page's spinlock, call the worker function. */
WT_PAGE_LOCK(session, page);
ret = __col_append_serial_func(
session, ins_head, ins_stack, new_ins, recnop, skipdepth);
WT_PAGE_UNLOCK(session, page);
- /* Free unused memory on error. */
if (ret != 0) {
+ /* Free unused memory on error. */
__wt_free(session, new_ins);
return (ret);
}
@@ -171,21 +202,32 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
{
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
-
- /* Clear references to memory we now own. */
- *new_insp = NULL;
+ int simple;
+ u_int i;
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
- /* Acquire the page's spinlock, call the worker function. */
- WT_PAGE_LOCK(session, page);
- ret = __insert_serial_func(
- session, ins_head, ins_stack, new_ins, skipdepth);
- WT_PAGE_UNLOCK(session, page);
+ /* Clear references to memory we now own and must free on error. */
+ *new_insp = NULL;
+
+ simple = 1;
+ for (i = 0; i < skipdepth; i++)
+ if (new_ins->next[i] == NULL)
+ simple = 0;
+
+ if (simple)
+ ret = __insert_simple_func(
+ session, ins_stack, new_ins, skipdepth);
+ else {
+ WT_PAGE_LOCK(session, page);
+ ret = __insert_serial_func(
+ session, ins_head, ins_stack, new_ins, skipdepth);
+ WT_PAGE_UNLOCK(session, page);
+ }
- /* Free unused memory on error. */
if (ret != 0) {
+ /* Free unused memory on error. */
__wt_free(session, new_ins);
return (ret);
}
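The lock-free path above comes down to one compare-and-swap per skiplist level. A reduced, single-level sketch of the pattern (illustrative types; __sync_bool_compare_and_swap stands in for __wt_atomic_cas_ptr):

#include <stdbool.h>
#include <stddef.h>

struct ex_ins {
	struct ex_ins *next[1];		/* One level, for brevity */
};

/*
 * ins_stack points at the "next" slot found by the preceding search, and
 * new_ins->next[0] holds the expected successor.  If another thread has
 * changed the slot since the search, the CAS fails and the caller either
 * restarts (level 0) or stops early (upper levels are best-effort).
 */
static bool
ex_link_level(struct ex_ins **ins_stack, struct ex_ins *new_ins)
{
	struct ex_ins *old_ins;

	old_ins = *ins_stack;
	if (old_ins != new_ins->next[0])
		return (false);
	return (__sync_bool_compare_and_swap(ins_stack, old_ins, new_ins));
}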
@@ -215,26 +257,27 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_DECL_RET;
WT_UPDATE *obsolete, *upd = *updp;
- /* Clear references to memory we now own. */
- *updp = NULL;
-
/* Check for page write generation wrap. */
WT_RET(__page_write_gen_wrapped_check(page));
+ /* Clear references to memory we now own and must free on error. */
+ *updp = NULL;
+
/*
+ * All structure setup must be flushed before the structure is entered
+	 * into the list. We need a write barrier here; our callers depend on
+ * it.
+ *
* Swap the update into place. If that fails, a new update was added
- * after our search, we raced. Check if our update is still permitted,
- * and if it is, do a full-barrier to ensure the update's next pointer
- * is set before we update the linked list and try again.
+	 * after our search and we raced; check if our update is still permitted.
*/
- while (!WT_ATOMIC_CAS8(*srch_upd, upd->next, upd)) {
+ while (!__wt_atomic_cas_ptr(srch_upd, upd->next, upd)) {
if ((ret = __wt_txn_update_check(
session, upd->next = *srch_upd)) != 0) {
/* Free unused memory on error. */
__wt_free(session, upd);
return (ret);
}
- WT_WRITE_BARRIER();
}
/*
@@ -249,25 +292,37 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
__wt_page_modify_set(session, page);
/*
- * If there are subsequent WT_UPDATE structures, we're evicting pages
- * and the page-scanning mutex isn't held, discard obsolete WT_UPDATE
- * structures. Serialization is needed so only one thread does the
- * obsolete check at a time, and to protect updates from disappearing
- * under reconciliation.
+	 * If there are no subsequent WT_UPDATE structures, we are done here.
*/
- if (upd->next != NULL &&
- __wt_txn_visible_all(session, page->modify->obsolete_check_txn)) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- /* If we can't lock it, don't scan, that's okay. */
- if (ret != 0)
- return (0);
- obsolete = __wt_update_obsolete_check(session, page, upd->next);
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (obsolete != NULL) {
+ if (upd->next == NULL)
+ return (0);
+ /*
+	 * We would like to call __wt_txn_update_oldest only when there are
+	 * further updates to this page; the check against WT_TXN_NONE is used
+	 * as an indicator that further updates exist on this page.
+ */
+ if (page->modify->obsolete_check_txn != WT_TXN_NONE) {
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
+ /* Try to move the oldest ID forward and re-check */
+			__wt_txn_update_oldest(session, 0);
+ }
+ if (!__wt_txn_visible_all(session,
+ page->modify->obsolete_check_txn)) {
page->modify->obsolete_check_txn = WT_TXN_NONE;
- __wt_update_obsolete_free(session, page, obsolete);
+ return (0);
}
}
+ F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret);
+
+ /* If we can't lock it, don't scan, that's okay. */
+ if (ret != 0)
+ return (0);
+ obsolete = __wt_update_obsolete_check(session, page, upd->next);
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+	if (obsolete != NULL)
+		__wt_update_obsolete_free(session, page, obsolete);
return (0);
}
diff --git a/src/include/session.h b/src/include/session.h
index f32da177bf9..a691794fd46 100644
--- a/src/include/session.h
+++ b/src/include/session.h
@@ -14,8 +14,8 @@
struct __wt_data_handle_cache {
WT_DATA_HANDLE *dhandle;
- SLIST_ENTRY(__wt_data_handle_cache) l;
- SLIST_ENTRY(__wt_data_handle_cache) hashl;
+ TAILQ_ENTRY(__wt_data_handle_cache) q;
+ TAILQ_ENTRY(__wt_data_handle_cache) hashq;
};
/*
@@ -66,7 +66,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
* across session close - so it is declared further down.
*/
/* Session handle reference list */
- SLIST_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
+ TAILQ_HEAD(__dhandles, __wt_data_handle_cache) dhandles;
time_t last_sweep; /* Last sweep for dead handles */
WT_CURSOR *cursor; /* Current cursor */
@@ -76,6 +76,11 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
WT_CURSOR_BACKUP *bkp_cursor; /* Hot backup cursor */
WT_COMPACT *compact; /* Compact state */
+ /*
+	 * Lookaside table cursor (sweep and eviction worker threads only).
+ */
+ WT_CURSOR *las_cursor; /* Lookaside table cursor */
+
WT_DATA_HANDLE *meta_dhandle; /* Metadata file */
void *meta_track; /* Metadata operation tracking */
void *meta_track_next; /* Current position */
@@ -90,7 +95,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
* table of lists. The hash table list is kept in allocated memory
* that lives across session close - so it is declared further down.
*/
- SLIST_HEAD(__tables, __wt_table) tables;
+ TAILQ_HEAD(__tables, __wt_table) tables;
WT_ITEM **scratch; /* Temporary memory for any function */
u_int scratch_alloc; /* Currently allocated */
@@ -151,9 +156,9 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl {
WT_RAND_STATE rnd; /* Random number generation state */
/* Hashed handle reference list array */
- SLIST_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash;
+ TAILQ_HEAD(__dhandles_hash, __wt_data_handle_cache) *dhhash;
/* Hashed table reference list array */
- SLIST_HEAD(__tables_hash, __wt_table) *tablehash;
+ TAILQ_HEAD(__tables_hash, __wt_table) *tablehash;
/*
* Splits can "free" memory that may still be in use, and we use a
diff --git a/src/include/stat.h b/src/include/stat.h
index 6dc9282a613..cd2c149bc94 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -6,122 +6,217 @@
* See the file LICENSE for redistribution information.
*/
-struct __wt_stats {
- const char *desc; /* text description */
- uint64_t v; /* 64-bit value */
-};
+/*
+ * Statistics counters:
+ *
+ * We use an array of statistics structures; threads write different structures
+ * to avoid writing the same cache line and incurring cache coherency overheads,
+ * which can dramatically slow otherwise fast, read-mostly workloads.
+ *
+ * With an 8B statistics value and 64B cache-line alignment, 8 values share the
+ * same cache line. There are collisions when different threads choose the same
+ * statistics structure and update values that live on the cache line. There is
+ * likely some locality, however: a thread updating the cursor-search statistic
+ * will probably update other cursor statistics too, with a chance of hitting
+ * already-cached values.
+ *
+ * The actual statistic value must be signed, because one thread might increment
+ * the value in its structure, and then another thread might decrement the same
+ * value in another structure (where the value was initially zero), so the value
+ * in the second thread's slot will go negative.
+ *
+ * When reading a statistics value, the array values are summed and returned to
+ * the caller. The summation is performed without locking, so the value read
+ * may be inconsistent (and might be negative, if increments/decrements race
+ * with the reader).
+ *
+ * Choosing how many structures isn't easy: obviously, a smaller number creates
+ * more conflicts while a larger number uses more memory.
+ *
+ * Ideally, if the application is CPU-intensive and using all of the CPUs on
+ * the system, we want to use the same number of slots as there are
+ * CPUs (because their L1 caches are the units of coherency). However, in
+ * practice we cannot easily determine how many CPUs are actually available to
+ * the application.
+ *
+ * Our next best option is to use the number of threads in the application as a
+ * heuristic for the number of CPUs (presumably, the application architect has
+ * figured out how many CPUs are available). However, inside WiredTiger we don't
+ * know when the application creates its threads.
+ *
+ * For now, we use a fixed number of slots. Ideally, we would approximate the
+ * largest number of cores we expect on any machine where WiredTiger might be
+ * run; however, we don't want to waste that much memory on smaller machines.
+ * As of 2015, machines with more than 24 CPUs are relatively rare.
+ *
+ * As with a hash table, use a prime number of slots rather than assuming a
+ * good hash (Reference Sedgewick, Algorithms in C, "Hash Functions").
+ */
+#define WT_COUNTER_SLOTS 23
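A minimal standalone model of the scheme described above: writers touch only their own slot, readers sum across the slots without locking and clamp racy negative results (the ex_* names are illustrative):

#include <stdint.h>

#define EX_COUNTER_SLOTS	23

struct ex_stats {
	int64_t cursor_search;		/* 8 values share a cache line */
	int64_t cursor_insert;
};

/* One structure per slot; different threads write different slots. */
static struct ex_stats ex_slots[EX_COUNTER_SLOTS];

/* Writers update only the slot chosen for the calling thread. */
static inline void
ex_incr_cursor_search(uint32_t slot_id)
{
	++ex_slots[slot_id % EX_COUNTER_SLOTS].cursor_search;
}

/* Readers sum the slots without locking; the result is approximate. */
static int64_t
ex_read_cursor_search(void)
{
	int64_t sum;
	int i;

	for (sum = 0, i = 0; i < EX_COUNTER_SLOTS; i++)
		sum += ex_slots[i].cursor_search;
	return (sum < 0 ? 0 : sum);	/* Clamp racy negative sums */
}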
/*
- * Read/write statistics without any test for statistics configuration.
+ * WT_STATS_SLOT_ID is the thread's slot ID for the array of structures.
+ *
+ * Ideally, we want a slot per CPU, and we want each thread to index the slot
+ * corresponding to the CPU it runs on. Unfortunately, getting the ID of the
+ * current CPU is difficult: some operating systems provide a system call to
+ * acquire a CPU ID, but not all (regardless, making a system call to increment
+ * a statistics value is far too expensive).
+ *
+ * Our second-best option is to use the thread ID. Unfortunately, there is no
+ * portable way to obtain a unique thread ID that's a small-enough number to
+ * be used as an array index (portable thread IDs are usually a pointer or an
+ * opaque chunk, not a simple integer).
+ *
+ * Our solution is to use the session ID; there is normally a session per thread
+ * and the session ID is a small, monotonically increasing number.
*/
-#define WT_STAT(stats, fld) \
- ((stats)->fld.v)
-#define WT_STAT_ATOMIC_DECRV(stats, fld, value) do { \
- (void)WT_ATOMIC_SUB8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_DECR(stats, fld) WT_STAT_ATOMIC_DECRV(stats, fld, 1)
-#define WT_STAT_ATOMIC_INCRV(stats, fld, value) do { \
- (void)WT_ATOMIC_ADD8(WT_STAT(stats, fld), (value)); \
-} while (0)
-#define WT_STAT_ATOMIC_INCR(stats, fld) WT_STAT_ATOMIC_INCRV(stats, fld, 1)
-#define WT_STAT_DECRV(stats, fld, value) do { \
- (stats)->fld.v -= (value); \
-} while (0)
-#define WT_STAT_DECR(stats, fld) WT_STAT_DECRV(stats, fld, 1)
-#define WT_STAT_INCRV(stats, fld, value) do { \
- (stats)->fld.v += (value); \
-} while (0)
-#define WT_STAT_INCR(stats, fld) WT_STAT_INCRV(stats, fld, 1)
-#define WT_STAT_SET(stats, fld, value) do { \
- (stats)->fld.v = (uint64_t)(value); \
-} while (0)
+#define WT_STATS_SLOT_ID(session) \
+	(((session)->id) % WT_COUNTER_SLOTS)
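So a per-session increment under this scheme indexes the array of structures by session, for example (an illustrative fragment, matching the WT_STAT_INCRV definition later in this file):

	S2C(session)->stats[WT_STATS_SLOT_ID(session)]->cursor_search++;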
/*
- * Read/write statistics if "fast" statistics are configured.
+ * Statistics structures are arrays of int64_t's. We have functions to
+ * read/write those structures regardless of the specific statistics structure
+ * we're working with, by translating field names to structure offsets.
+ *
+ * Translate a statistic's value name to an offset.
*/
-#define WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, value) do { \
- if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_ATOMIC_DECRV(stats, fld, value); \
-} while (0)
-#define WT_STAT_FAST_ATOMIC_DECR(session, stats, fld) \
- WT_STAT_FAST_ATOMIC_DECRV(session, stats, fld, 1)
-#define WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, value) do { \
- if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_ATOMIC_INCRV(stats, fld, value); \
+#define WT_STATS_FIELD_TO_SLOT(stats, fld) \
+ (int)(&(stats)[0]->fld - (int64_t *)(stats)[0])
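The translation works because the statistics structures are laid out as consecutive int64_t fields, so subtracting the base pointer yields a field's index. A tiny standalone illustration (hypothetical struct):

#include <stdint.h>
#include <stdio.h>

struct ex_stats {
	int64_t a, b, c;
};

int
main(void)
{
	struct ex_stats s, *sp = &s;

	/* &sp->c - (int64_t *)sp == 2: c's slot in the flattened view. */
	printf("%d\n", (int)(&sp->c - (int64_t *)sp));
	return (0);
}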
+
+/*
+ * Sum the values from all structures in the array.
+ */
+static inline int64_t
+__wt_stats_aggregate(void *stats_arg, int slot)
+{
+ int64_t **stats, aggr_v;
+ int i;
+
+ stats = stats_arg;
+ for (aggr_v = 0, i = 0; i < WT_COUNTER_SLOTS; i++)
+ aggr_v += stats[i][slot];
+
+ /*
+ * This can race. However, any implementation with a single value can
+ * race as well, different threads could set the same counter value
+ * simultaneously. While we are making races more likely, we are not
+ * fundamentally weakening the isolation semantics found in updating a
+ * single value.
+ *
+ * Additionally, the aggregation can go negative (imagine a thread
+ * incrementing a value after aggregation has passed its slot and a
+ * second thread decrementing a value before aggregation has reached
+ * its slot).
+ *
+ * For historic API compatibility, the external type is a uint64_t;
+	 * limit our return to non-negative values; negative numbers would just
+ * look really, really large.
+ */
+ if (aggr_v < 0)
+ aggr_v = 0;
+ return (aggr_v);
+}
+
+/*
+ * Clear the values in all structures in the array.
+ */
+static inline void
+__wt_stats_clear(void *stats_arg, int slot)
+{
+ int64_t **stats;
+ int i;
+
+ stats = stats_arg;
+ for (i = 0; i < WT_COUNTER_SLOTS; i++)
+ stats[i][slot] = 0;
+}
+
+/*
+ * Read/write statistics without any test for statistics configuration. Reading
+ * and writing the field requires different actions: reading sums the values
+ * across the array of structures, writing updates a single structure's value.
+ */
+#define WT_STAT_READ(stats, fld) \
+ __wt_stats_aggregate(stats, WT_STATS_FIELD_TO_SLOT(stats, fld))
+#define WT_STAT_WRITE(session, stats, fld) \
+	((stats)[WT_STATS_SLOT_ID(session)]->fld)
+
+#define WT_STAT_DECRV(session, stats, fld, value) \
+ (stats)[WT_STATS_SLOT_ID(session)]->fld -= (int64_t)(value)
+#define WT_STAT_DECR(session, stats, fld) \
+ WT_STAT_DECRV(session, stats, fld, 1)
+#define WT_STAT_INCRV(session, stats, fld, value) \
+ (stats)[WT_STATS_SLOT_ID(session)]->fld += (int64_t)(value)
+#define WT_STAT_INCR(session, stats, fld) \
+ WT_STAT_INCRV(session, stats, fld, 1)
+#define WT_STAT_SET(session, stats, fld, value) do { \
+ __wt_stats_clear(stats, WT_STATS_FIELD_TO_SLOT(stats, fld)); \
+ (stats)[0]->fld = (int64_t)(value); \
} while (0)
-#define WT_STAT_FAST_ATOMIC_INCR(session, stats, fld) \
- WT_STAT_FAST_ATOMIC_INCRV(session, stats, fld, 1)
+
+/*
+ * Update statistics if "fast" statistics are configured.
+ */
#define WT_STAT_FAST_DECRV(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_DECRV(stats, fld, value); \
+ WT_STAT_DECRV(session, stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DECR(session, stats, fld) \
WT_STAT_FAST_DECRV(session, stats, fld, 1)
#define WT_STAT_FAST_INCRV(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_INCRV(stats, fld, value); \
+ WT_STAT_INCRV(session, stats, fld, value); \
} while (0)
#define WT_STAT_FAST_INCR(session, stats, fld) \
WT_STAT_FAST_INCRV(session, stats, fld, 1)
#define WT_STAT_FAST_SET(session, stats, fld, value) do { \
if (FLD_ISSET(S2C(session)->stat_flags, WT_CONN_STAT_FAST)) \
- WT_STAT_SET(stats, fld, value); \
+ WT_STAT_SET(session, stats, fld, value); \
} while (0)
/*
- * Read/write connection handle statistics if "fast" statistics are configured.
+ * Update connection handle statistics if "fast" statistics are configured.
*/
-#define WT_STAT_FAST_CONN_ATOMIC_DECRV(session, fld, value) \
- WT_STAT_FAST_ATOMIC_DECRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_DECR(session, fld) \
- WT_STAT_FAST_ATOMIC_DECR(session, &S2C(session)->stats, fld)
-#define WT_STAT_FAST_CONN_ATOMIC_INCRV(session, fld, value) \
- WT_STAT_FAST_ATOMIC_INCRV(session, &S2C(session)->stats, fld, value)
-#define WT_STAT_FAST_CONN_ATOMIC_INCR(session, fld) \
- WT_STAT_FAST_ATOMIC_INCR(session, &S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_DECR(session, fld) \
- WT_STAT_FAST_DECR(session, &S2C(session)->stats, fld)
+ WT_STAT_FAST_DECR(session, S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_DECRV(session, fld, value) \
- WT_STAT_FAST_DECRV(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_DECRV(session, S2C(session)->stats, fld, value)
#define WT_STAT_FAST_CONN_INCR(session, fld) \
- WT_STAT_FAST_INCR(session, &S2C(session)->stats, fld)
+ WT_STAT_FAST_INCR(session, S2C(session)->stats, fld)
#define WT_STAT_FAST_CONN_INCRV(session, fld, value) \
- WT_STAT_FAST_INCRV(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_INCRV(session, S2C(session)->stats, fld, value)
#define WT_STAT_FAST_CONN_SET(session, fld, value) \
- WT_STAT_FAST_SET(session, &S2C(session)->stats, fld, value)
+ WT_STAT_FAST_SET(session, S2C(session)->stats, fld, value)
/*
- * Read/write data-source handle statistics if the data-source handle is set
- * and "fast" statistics are configured.
+ * Update data-source handle statistics if "fast" statistics are configured
+ * and the data-source handle is set.
*
* XXX
* We shouldn't have to check if the data-source handle is NULL, but it's
- * useful until everything is converted to using data-source handles.
+ * necessary until everything is converted to using data-source handles.
*/
#define WT_STAT_FAST_DATA_DECRV(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_DECRV( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DATA_DECR(session, fld) \
WT_STAT_FAST_DATA_DECRV(session, fld, 1)
#define WT_STAT_FAST_DATA_INCRV(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_INCRV( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
#define WT_STAT_FAST_DATA_INCR(session, fld) \
WT_STAT_FAST_DATA_INCRV(session, fld, 1)
#define WT_STAT_FAST_DATA_SET(session, fld, value) do { \
if ((session)->dhandle != NULL) \
WT_STAT_FAST_SET( \
- session, &(session)->dhandle->stats, fld, value); \
+ session, (session)->dhandle->stats, fld, value); \
} while (0)
-/* Connection handle statistics value. */
-#define WT_CONN_STAT(session, fld) \
- WT_STAT(&S2C(session)->stats, fld)
-
/*
* DO NOT EDIT: automatically built by dist/stat.py.
*/
@@ -132,148 +227,157 @@ struct __wt_stats {
*/
#define WT_CONNECTION_STATS_BASE 1000
struct __wt_connection_stats {
- WT_STATS async_alloc_race;
- WT_STATS async_alloc_view;
- WT_STATS async_cur_queue;
- WT_STATS async_flush;
- WT_STATS async_full;
- WT_STATS async_max_queue;
- WT_STATS async_nowork;
- WT_STATS async_op_alloc;
- WT_STATS async_op_compact;
- WT_STATS async_op_insert;
- WT_STATS async_op_remove;
- WT_STATS async_op_search;
- WT_STATS async_op_update;
- WT_STATS block_byte_map_read;
- WT_STATS block_byte_read;
- WT_STATS block_byte_write;
- WT_STATS block_map_read;
- WT_STATS block_preload;
- WT_STATS block_read;
- WT_STATS block_write;
- WT_STATS cache_bytes_dirty;
- WT_STATS cache_bytes_internal;
- WT_STATS cache_bytes_inuse;
- WT_STATS cache_bytes_leaf;
- WT_STATS cache_bytes_max;
- WT_STATS cache_bytes_overflow;
- WT_STATS cache_bytes_read;
- WT_STATS cache_bytes_write;
- WT_STATS cache_eviction_app;
- WT_STATS cache_eviction_checkpoint;
- WT_STATS cache_eviction_clean;
- WT_STATS cache_eviction_deepen;
- WT_STATS cache_eviction_dirty;
- WT_STATS cache_eviction_fail;
- WT_STATS cache_eviction_force;
- WT_STATS cache_eviction_force_delete;
- WT_STATS cache_eviction_force_fail;
- WT_STATS cache_eviction_hazard;
- WT_STATS cache_eviction_internal;
- WT_STATS cache_eviction_maximum_page_size;
- WT_STATS cache_eviction_queue_empty;
- WT_STATS cache_eviction_queue_not_empty;
- WT_STATS cache_eviction_server_evicting;
- WT_STATS cache_eviction_server_not_evicting;
- WT_STATS cache_eviction_slow;
- WT_STATS cache_eviction_split;
- WT_STATS cache_eviction_walk;
- WT_STATS cache_eviction_worker_evicting;
- WT_STATS cache_inmem_split;
- WT_STATS cache_overhead;
- WT_STATS cache_pages_dirty;
- WT_STATS cache_pages_inuse;
- WT_STATS cache_read;
- WT_STATS cache_write;
- WT_STATS cond_wait;
- WT_STATS cursor_create;
- WT_STATS cursor_insert;
- WT_STATS cursor_next;
- WT_STATS cursor_prev;
- WT_STATS cursor_remove;
- WT_STATS cursor_reset;
- WT_STATS cursor_search;
- WT_STATS cursor_search_near;
- WT_STATS cursor_update;
- WT_STATS dh_conn_handles;
- WT_STATS dh_conn_ref;
- WT_STATS dh_conn_sweeps;
- WT_STATS dh_conn_tod;
- WT_STATS dh_session_handles;
- WT_STATS dh_session_sweeps;
- WT_STATS file_open;
- WT_STATS log_buffer_size;
- WT_STATS log_bytes_payload;
- WT_STATS log_bytes_written;
- WT_STATS log_close_yields;
- WT_STATS log_compress_len;
- WT_STATS log_compress_mem;
- WT_STATS log_compress_small;
- WT_STATS log_compress_write_fails;
- WT_STATS log_compress_writes;
- WT_STATS log_max_filesize;
- WT_STATS log_prealloc_files;
- WT_STATS log_prealloc_max;
- WT_STATS log_prealloc_used;
- WT_STATS log_release_write_lsn;
- WT_STATS log_scan_records;
- WT_STATS log_scan_rereads;
- WT_STATS log_scans;
- WT_STATS log_slot_closes;
- WT_STATS log_slot_coalesced;
- WT_STATS log_slot_consolidated;
- WT_STATS log_slot_joins;
- WT_STATS log_slot_races;
- WT_STATS log_slot_toobig;
- WT_STATS log_slot_toosmall;
- WT_STATS log_slot_transitions;
- WT_STATS log_sync;
- WT_STATS log_sync_dir;
- WT_STATS log_write_lsn;
- WT_STATS log_writes;
- WT_STATS lsm_checkpoint_throttle;
- WT_STATS lsm_merge_throttle;
- WT_STATS lsm_rows_merged;
- WT_STATS lsm_work_queue_app;
- WT_STATS lsm_work_queue_manager;
- WT_STATS lsm_work_queue_max;
- WT_STATS lsm_work_queue_switch;
- WT_STATS lsm_work_units_created;
- WT_STATS lsm_work_units_discarded;
- WT_STATS lsm_work_units_done;
- WT_STATS memory_allocation;
- WT_STATS memory_free;
- WT_STATS memory_grow;
- WT_STATS page_busy_blocked;
- WT_STATS page_forcible_evict_blocked;
- WT_STATS page_locked_blocked;
- WT_STATS page_read_blocked;
- WT_STATS page_sleep;
- WT_STATS read_io;
- WT_STATS rec_pages;
- WT_STATS rec_pages_eviction;
- WT_STATS rec_split_stashed_bytes;
- WT_STATS rec_split_stashed_objects;
- WT_STATS rwlock_read;
- WT_STATS rwlock_write;
- WT_STATS session_cursor_open;
- WT_STATS session_open;
- WT_STATS txn_begin;
- WT_STATS txn_checkpoint;
- WT_STATS txn_checkpoint_generation;
- WT_STATS txn_checkpoint_running;
- WT_STATS txn_checkpoint_time_max;
- WT_STATS txn_checkpoint_time_min;
- WT_STATS txn_checkpoint_time_recent;
- WT_STATS txn_checkpoint_time_total;
- WT_STATS txn_commit;
- WT_STATS txn_fail_cache;
- WT_STATS txn_pinned_checkpoint_range;
- WT_STATS txn_pinned_range;
- WT_STATS txn_rollback;
- WT_STATS txn_sync;
- WT_STATS write_io;
+ int64_t async_alloc_race;
+ int64_t async_alloc_view;
+ int64_t async_cur_queue;
+ int64_t async_flush;
+ int64_t async_full;
+ int64_t async_max_queue;
+ int64_t async_nowork;
+ int64_t async_op_alloc;
+ int64_t async_op_compact;
+ int64_t async_op_insert;
+ int64_t async_op_remove;
+ int64_t async_op_search;
+ int64_t async_op_update;
+ int64_t block_byte_map_read;
+ int64_t block_byte_read;
+ int64_t block_byte_write;
+ int64_t block_map_read;
+ int64_t block_preload;
+ int64_t block_read;
+ int64_t block_write;
+ int64_t cache_bytes_dirty;
+ int64_t cache_bytes_internal;
+ int64_t cache_bytes_inuse;
+ int64_t cache_bytes_leaf;
+ int64_t cache_bytes_max;
+ int64_t cache_bytes_overflow;
+ int64_t cache_bytes_read;
+ int64_t cache_bytes_write;
+ int64_t cache_eviction_app;
+ int64_t cache_eviction_checkpoint;
+ int64_t cache_eviction_clean;
+ int64_t cache_eviction_deepen;
+ int64_t cache_eviction_dirty;
+ int64_t cache_eviction_fail;
+ int64_t cache_eviction_force;
+ int64_t cache_eviction_force_delete;
+ int64_t cache_eviction_force_fail;
+ int64_t cache_eviction_hazard;
+ int64_t cache_eviction_internal;
+ int64_t cache_eviction_maximum_page_size;
+ int64_t cache_eviction_queue_empty;
+ int64_t cache_eviction_queue_not_empty;
+ int64_t cache_eviction_server_evicting;
+ int64_t cache_eviction_server_not_evicting;
+ int64_t cache_eviction_slow;
+ int64_t cache_eviction_split;
+ int64_t cache_eviction_walk;
+ int64_t cache_eviction_worker_evicting;
+ int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
+ int64_t cache_lookaside_insert;
+ int64_t cache_lookaside_remove;
+ int64_t cache_overhead;
+ int64_t cache_pages_dirty;
+ int64_t cache_pages_inuse;
+ int64_t cache_read;
+ int64_t cache_read_lookaside;
+ int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
+ int64_t cond_wait;
+ int64_t cursor_create;
+ int64_t cursor_insert;
+ int64_t cursor_next;
+ int64_t cursor_prev;
+ int64_t cursor_remove;
+ int64_t cursor_reset;
+ int64_t cursor_restart;
+ int64_t cursor_search;
+ int64_t cursor_search_near;
+ int64_t cursor_update;
+ int64_t dh_conn_handle_count;
+ int64_t dh_session_handles;
+ int64_t dh_session_sweeps;
+ int64_t dh_sweep_close;
+ int64_t dh_sweep_ref;
+ int64_t dh_sweep_remove;
+ int64_t dh_sweep_tod;
+ int64_t dh_sweeps;
+ int64_t file_open;
+ int64_t log_buffer_size;
+ int64_t log_bytes_payload;
+ int64_t log_bytes_written;
+ int64_t log_close_yields;
+ int64_t log_compress_len;
+ int64_t log_compress_mem;
+ int64_t log_compress_small;
+ int64_t log_compress_write_fails;
+ int64_t log_compress_writes;
+ int64_t log_max_filesize;
+ int64_t log_prealloc_files;
+ int64_t log_prealloc_max;
+ int64_t log_prealloc_used;
+ int64_t log_release_write_lsn;
+ int64_t log_scan_records;
+ int64_t log_scan_rereads;
+ int64_t log_scans;
+ int64_t log_slot_closes;
+ int64_t log_slot_coalesced;
+ int64_t log_slot_consolidated;
+ int64_t log_slot_joins;
+ int64_t log_slot_races;
+ int64_t log_slot_switch_busy;
+ int64_t log_slot_transitions;
+ int64_t log_slot_unbuffered;
+ int64_t log_sync;
+ int64_t log_sync_dir;
+ int64_t log_write_lsn;
+ int64_t log_writes;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_merge_throttle;
+ int64_t lsm_rows_merged;
+ int64_t lsm_work_queue_app;
+ int64_t lsm_work_queue_manager;
+ int64_t lsm_work_queue_max;
+ int64_t lsm_work_queue_switch;
+ int64_t lsm_work_units_created;
+ int64_t lsm_work_units_discarded;
+ int64_t lsm_work_units_done;
+ int64_t memory_allocation;
+ int64_t memory_free;
+ int64_t memory_grow;
+ int64_t page_busy_blocked;
+ int64_t page_forcible_evict_blocked;
+ int64_t page_locked_blocked;
+ int64_t page_read_blocked;
+ int64_t page_sleep;
+ int64_t read_io;
+ int64_t rec_pages;
+ int64_t rec_pages_eviction;
+ int64_t rec_split_stashed_bytes;
+ int64_t rec_split_stashed_objects;
+ int64_t rwlock_read;
+ int64_t rwlock_write;
+ int64_t session_cursor_open;
+ int64_t session_open;
+ int64_t txn_begin;
+ int64_t txn_checkpoint;
+ int64_t txn_checkpoint_generation;
+ int64_t txn_checkpoint_running;
+ int64_t txn_checkpoint_time_max;
+ int64_t txn_checkpoint_time_min;
+ int64_t txn_checkpoint_time_recent;
+ int64_t txn_checkpoint_time_total;
+ int64_t txn_commit;
+ int64_t txn_fail_cache;
+ int64_t txn_pinned_checkpoint_range;
+ int64_t txn_pinned_range;
+ int64_t txn_rollback;
+ int64_t txn_sync;
+ int64_t write_io;
};
/*
@@ -281,96 +385,102 @@ struct __wt_connection_stats {
*/
#define WT_DSRC_STATS_BASE 2000
struct __wt_dsrc_stats {
- WT_STATS allocation_size;
- WT_STATS block_alloc;
- WT_STATS block_checkpoint_size;
- WT_STATS block_extension;
- WT_STATS block_free;
- WT_STATS block_magic;
- WT_STATS block_major;
- WT_STATS block_minor;
- WT_STATS block_reuse_bytes;
- WT_STATS block_size;
- WT_STATS bloom_count;
- WT_STATS bloom_false_positive;
- WT_STATS bloom_hit;
- WT_STATS bloom_miss;
- WT_STATS bloom_page_evict;
- WT_STATS bloom_page_read;
- WT_STATS bloom_size;
- WT_STATS btree_checkpoint_generation;
- WT_STATS btree_column_deleted;
- WT_STATS btree_column_fix;
- WT_STATS btree_column_internal;
- WT_STATS btree_column_variable;
- WT_STATS btree_compact_rewrite;
- WT_STATS btree_entries;
- WT_STATS btree_fixed_len;
- WT_STATS btree_maximum_depth;
- WT_STATS btree_maxintlkey;
- WT_STATS btree_maxintlpage;
- WT_STATS btree_maxleafkey;
- WT_STATS btree_maxleafpage;
- WT_STATS btree_maxleafvalue;
- WT_STATS btree_overflow;
- WT_STATS btree_row_internal;
- WT_STATS btree_row_leaf;
- WT_STATS cache_bytes_read;
- WT_STATS cache_bytes_write;
- WT_STATS cache_eviction_checkpoint;
- WT_STATS cache_eviction_clean;
- WT_STATS cache_eviction_deepen;
- WT_STATS cache_eviction_dirty;
- WT_STATS cache_eviction_fail;
- WT_STATS cache_eviction_hazard;
- WT_STATS cache_eviction_internal;
- WT_STATS cache_eviction_split;
- WT_STATS cache_inmem_split;
- WT_STATS cache_overflow_value;
- WT_STATS cache_read;
- WT_STATS cache_read_overflow;
- WT_STATS cache_write;
- WT_STATS compress_raw_fail;
- WT_STATS compress_raw_fail_temporary;
- WT_STATS compress_raw_ok;
- WT_STATS compress_read;
- WT_STATS compress_write;
- WT_STATS compress_write_fail;
- WT_STATS compress_write_too_small;
- WT_STATS cursor_create;
- WT_STATS cursor_insert;
- WT_STATS cursor_insert_bulk;
- WT_STATS cursor_insert_bytes;
- WT_STATS cursor_next;
- WT_STATS cursor_prev;
- WT_STATS cursor_remove;
- WT_STATS cursor_remove_bytes;
- WT_STATS cursor_reset;
- WT_STATS cursor_search;
- WT_STATS cursor_search_near;
- WT_STATS cursor_update;
- WT_STATS cursor_update_bytes;
- WT_STATS lsm_checkpoint_throttle;
- WT_STATS lsm_chunk_count;
- WT_STATS lsm_generation_max;
- WT_STATS lsm_lookup_no_bloom;
- WT_STATS lsm_merge_throttle;
- WT_STATS rec_dictionary;
- WT_STATS rec_multiblock_internal;
- WT_STATS rec_multiblock_leaf;
- WT_STATS rec_multiblock_max;
- WT_STATS rec_overflow_key_internal;
- WT_STATS rec_overflow_key_leaf;
- WT_STATS rec_overflow_value;
- WT_STATS rec_page_delete;
- WT_STATS rec_page_match;
- WT_STATS rec_pages;
- WT_STATS rec_pages_eviction;
- WT_STATS rec_prefix_compression;
- WT_STATS rec_suffix_compression;
- WT_STATS session_compact;
- WT_STATS session_cursor_open;
- WT_STATS txn_update_conflict;
+ int64_t allocation_size;
+ int64_t block_alloc;
+ int64_t block_checkpoint_size;
+ int64_t block_extension;
+ int64_t block_free;
+ int64_t block_magic;
+ int64_t block_major;
+ int64_t block_minor;
+ int64_t block_reuse_bytes;
+ int64_t block_size;
+ int64_t bloom_count;
+ int64_t bloom_false_positive;
+ int64_t bloom_hit;
+ int64_t bloom_miss;
+ int64_t bloom_page_evict;
+ int64_t bloom_page_read;
+ int64_t bloom_size;
+ int64_t btree_checkpoint_generation;
+ int64_t btree_column_deleted;
+ int64_t btree_column_fix;
+ int64_t btree_column_internal;
+ int64_t btree_column_rle;
+ int64_t btree_column_variable;
+ int64_t btree_compact_rewrite;
+ int64_t btree_entries;
+ int64_t btree_fixed_len;
+ int64_t btree_maximum_depth;
+ int64_t btree_maxintlkey;
+ int64_t btree_maxintlpage;
+ int64_t btree_maxleafkey;
+ int64_t btree_maxleafpage;
+ int64_t btree_maxleafvalue;
+ int64_t btree_overflow;
+ int64_t btree_row_internal;
+ int64_t btree_row_leaf;
+ int64_t cache_bytes_read;
+ int64_t cache_bytes_write;
+ int64_t cache_eviction_checkpoint;
+ int64_t cache_eviction_clean;
+ int64_t cache_eviction_deepen;
+ int64_t cache_eviction_dirty;
+ int64_t cache_eviction_fail;
+ int64_t cache_eviction_hazard;
+ int64_t cache_eviction_internal;
+ int64_t cache_eviction_split;
+ int64_t cache_inmem_split;
+ int64_t cache_inmem_splittable;
+ int64_t cache_overflow_value;
+ int64_t cache_read;
+ int64_t cache_read_lookaside;
+ int64_t cache_read_overflow;
+ int64_t cache_write;
+ int64_t cache_write_lookaside;
+ int64_t cache_write_restore;
+ int64_t compress_raw_fail;
+ int64_t compress_raw_fail_temporary;
+ int64_t compress_raw_ok;
+ int64_t compress_read;
+ int64_t compress_write;
+ int64_t compress_write_fail;
+ int64_t compress_write_too_small;
+ int64_t cursor_create;
+ int64_t cursor_insert;
+ int64_t cursor_insert_bulk;
+ int64_t cursor_insert_bytes;
+ int64_t cursor_next;
+ int64_t cursor_prev;
+ int64_t cursor_remove;
+ int64_t cursor_remove_bytes;
+ int64_t cursor_reset;
+ int64_t cursor_restart;
+ int64_t cursor_search;
+ int64_t cursor_search_near;
+ int64_t cursor_update;
+ int64_t cursor_update_bytes;
+ int64_t lsm_checkpoint_throttle;
+ int64_t lsm_chunk_count;
+ int64_t lsm_generation_max;
+ int64_t lsm_lookup_no_bloom;
+ int64_t lsm_merge_throttle;
+ int64_t rec_dictionary;
+ int64_t rec_multiblock_internal;
+ int64_t rec_multiblock_leaf;
+ int64_t rec_multiblock_max;
+ int64_t rec_overflow_key_internal;
+ int64_t rec_overflow_key_leaf;
+ int64_t rec_overflow_value;
+ int64_t rec_page_delete;
+ int64_t rec_page_match;
+ int64_t rec_pages;
+ int64_t rec_pages_eviction;
+ int64_t rec_prefix_compression;
+ int64_t rec_suffix_compression;
+ int64_t session_compact;
+ int64_t session_cursor_open;
+ int64_t txn_update_conflict;
};
/* Statistics section: END */
diff --git a/src/include/txn.h b/src/include/txn.h
index 7a67f713244..4a325c70a95 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -31,7 +31,7 @@
struct __wt_named_snapshot {
const char *name;
- STAILQ_ENTRY(__wt_named_snapshot) q;
+ TAILQ_ENTRY(__wt_named_snapshot) q;
uint64_t snap_min, snap_max;
uint64_t *snapshot;
@@ -72,15 +72,14 @@ struct __wt_txn_global {
/* Named snapshot state. */
WT_RWLOCK *nsnap_rwlock;
volatile uint64_t nsnap_oldest_id;
- STAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph;
+ TAILQ_HEAD(__wt_nsnap_qh, __wt_named_snapshot) nsnaph;
WT_TXN_STATE *states; /* Per-session transaction states */
};
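The STAILQ-to-TAILQ switch gives the named-snapshot list back-links, so dropping an arbitrary snapshot no longer requires walking to its predecessor. A minimal sketch with the BSD queue macros (WiredTiger ships its own queue.h; the struct and helper here are illustrative, not the real code):

#include <stdlib.h>
#include <sys/queue.h>

struct nsnap {
        char *name;
        TAILQ_ENTRY(nsnap) q;           /* forward and backward links */
};
TAILQ_HEAD(nsnap_qh, nsnap);

/* Removal by pointer is O(1) with a doubly-linked TAILQ. */
static void
nsnap_drop(struct nsnap_qh *qh, struct nsnap *s)
{
        TAILQ_REMOVE(qh, s, q);
        free(s->name);
        free(s);
}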
typedef enum __wt_txn_isolation {
- WT_ISO_EVICTION, /* Internal: eviction context */
- WT_ISO_READ_UNCOMMITTED,
WT_ISO_READ_COMMITTED,
+ WT_ISO_READ_UNCOMMITTED,
WT_ISO_SNAPSHOT
} WT_TXN_ISOLATION;
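With the internal-only WT_ISO_EVICTION level removed, the enum now mirrors exactly the three isolation levels selectable through configuration strings. A hedged sketch against the public API (error handling trimmed):

#include <wiredtiger.h>

static int
open_snapshot_session(WT_CONNECTION *conn, WT_SESSION **sessionp)
{
        int ret;

        /* The session default maps to WT_ISO_SNAPSHOT internally. */
        if ((ret = conn->open_session(
            conn, NULL, "isolation=snapshot", sessionp)) != 0)
                return (ret);
        /* Individual transactions can override the session default. */
        return ((*sessionp)->begin_transaction(
            *sessionp, "isolation=read-committed"));
}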
diff --git a/src/include/txn.i b/src/include/txn.i
index a9b54d26e47..2b42990f5e5 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -140,12 +140,22 @@ __wt_txn_oldest_id(WT_SESSION_IMPL *session)
}
/*
+ * __wt_txn_committed --
+ * Return if a transaction has been committed.
+ */
+static inline bool
+__wt_txn_committed(WT_SESSION_IMPL *session, uint64_t id)
+{
+ return (WT_TXNID_LT(id, S2C(session)->txn_global.last_running));
+}
+
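Transaction IDs are monotonically increasing 64-bit counters that never wrap (see the comment further down about exhausting 64 bits of IDs), so WT_TXNID_LT is a plain unsigned compare and "committed" reduces to being older than the oldest ID still running. An illustrative stand-in only:

#include <stdbool.h>
#include <stdint.h>

static bool
txnid_committed(uint64_t id, uint64_t last_running)
{
        /* Any ID below the oldest running ID has already resolved. */
        return (id < last_running);
}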
+/*
* __wt_txn_visible_all --
* Check if a given transaction ID is "globally visible". That is,
* whether all sessions in the system will see the transaction ID,
* including the ID that belongs to a running checkpoint.
*/
-static inline int
+static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
{
uint64_t oldest_id;
@@ -159,28 +169,21 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id)
* __wt_txn_visible --
* Can the current transaction see the given ID?
*/
-static inline int
+static inline bool
__wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
{
WT_TXN *txn;
- int found;
+ bool found;
txn = &session->txn;
/* Changes with no associated transaction are always visible. */
if (id == WT_TXN_NONE)
- return (1);
+ return (true);
/* Nobody sees the results of aborted transactions. */
if (id == WT_TXN_ABORTED)
- return (0);
-
- /*
- * Eviction only sees globally visible updates, or if there is a
- * checkpoint transaction running, use its transaction.
- */
- if (txn->isolation == WT_ISO_EVICTION)
- return (__wt_txn_visible_all(session, id));
+ return (false);
/*
* Read-uncommitted transactions see all other changes.
@@ -194,11 +197,11 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
*/
if (txn->isolation == WT_ISO_READ_UNCOMMITTED ||
session->dhandle == session->meta_dhandle)
- return (1);
+ return (true);
/* Transactions see their own changes. */
if (id == txn->id)
- return (1);
+ return (true);
/*
* WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is
@@ -210,9 +213,9 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
* snapshot is empty.
*/
if (WT_TXNID_LE(txn->snap_max, id))
- return (0);
+ return (false);
if (txn->snapshot_count == 0 || WT_TXNID_LT(id, txn->snap_min))
- return (1);
+ return (true);
WT_BINARY_SEARCH(id, txn->snapshot, txn->snapshot_count, found);
return (!found);
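The snapshot check above has three tiers: IDs at or beyond snap_max are newer than the snapshot and invisible; IDs below snap_min predate every transaction that was running and are visible; anything in between is visible only if it was not concurrent, that is, not in the snapshot array. A self-contained sketch, with the search mirroring the WT_BINARY_SEARCH pattern (all names are illustrative):

#include <stdbool.h>
#include <stdint.h>

static bool
id_in_snapshot(uint64_t id, const uint64_t *snap, uint32_t cnt)
{
        uint32_t base, indx, limit;

        for (base = 0, limit = cnt; limit != 0; limit >>= 1) {
                indx = base + (limit >> 1);
                if (snap[indx] < id) {
                        base = indx + 1;
                        --limit;
                } else if (snap[indx] == id)
                        return (true);
        }
        return (false);
}

static bool
snapshot_visible(uint64_t id, uint64_t snap_min, uint64_t snap_max,
    const uint64_t *snap, uint32_t cnt)
{
        if (id >= snap_max)                     /* too new to see */
                return (false);
        if (cnt == 0 || id < snap_min)          /* older than everyone */
                return (true);
        return (!id_in_snapshot(id, snap, cnt));
}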
@@ -266,7 +269,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
}
F_SET(txn, WT_TXN_RUNNING);
- return (0);
+ return (false);
}
/*
@@ -300,7 +303,7 @@ __wt_txn_new_id(WT_SESSION_IMPL *session)
* global current ID, so we want post-increment semantics. Our atomic
* add primitive does pre-increment, so adjust the result here.
*/
- return (WT_ATOMIC_ADD8(S2C(session)->txn_global.current, 1) - 1);
+ return (__wt_atomic_addv64(&S2C(session)->txn_global.current, 1) - 1);
}
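The comment above is the whole trick: the new atomic wrapper (like the old macro) returns the value after the addition, pre-increment style, and subtracting one restores the post-increment semantics the allocator wants. A sketch with the GCC/Clang builtins, assuming nothing about the real wrapper:

#include <stdint.h>

static uint64_t
new_txn_id(uint64_t *currentp)
{
        /* __atomic_add_fetch returns the incremented value. */
        return (__atomic_add_fetch(currentp, 1, __ATOMIC_SEQ_CST) - 1);
}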
/*
@@ -376,8 +379,9 @@ __wt_txn_id_check(WT_SESSION_IMPL *session)
*/
do {
txn_state->id = txn->id = txn_global->current;
- } while (!WT_ATOMIC_CAS8(
- txn_global->current, txn->id, txn->id + 1));
+ } while (!__wt_atomic_casv64(
+ &txn_global->current, txn->id, txn->id + 1) ||
+ WT_TXNID_LT(txn->id, txn_global->last_running));
/*
* If we have used 64-bits of transaction IDs, there is nothing
@@ -476,7 +480,7 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
* __wt_txn_am_oldest --
* Am I the oldest transaction in the system?
*/
-static inline int
+static inline bool
__wt_txn_am_oldest(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
@@ -491,12 +495,12 @@ __wt_txn_am_oldest(WT_SESSION_IMPL *session)
txn_global = &conn->txn_global;
if (txn->id == WT_TXN_NONE)
- return (0);
+ return (false);
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++)
if ((id = s->id) != WT_TXN_NONE && WT_TXNID_LT(id, txn->id))
- return (0);
+ return (false);
- return (1);
+ return (true);
}
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index e8f3b9958ce..71ba3f41a44 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1750,6 +1750,9 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that
* is shared between databases or \c "none" when no shared cache is
* configured., a string; default \c none.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of cache this
+ * database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this
* database is guaranteed to have available from the shared cache. This
* setting is per database. Defaults to the chunk size., an integer;
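A hedged example of the new knob: the connection joins a shared cache pool and is capped at a quota while keeping a guaranteed reserve (the home path and sizes are placeholders):

#include <wiredtiger.h>

static int
open_with_shared_cache(WT_CONNECTION **connp)
{
        /*
         * quota caps what this database may take from the pool and
         * defaults to the whole pool; reserve is what it is
         * guaranteed and defaults to the chunk size.
         */
        return (wiredtiger_open("/path/to/db", NULL,
            "create,"
            "shared_cache=(name=pool,size=500MB,quota=200MB,reserve=50MB)",
            connp));
}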
@@ -2072,8 +2075,10 @@ struct __wt_connection {
* @config{checkpoint_sync, flush files to stable storage when closing or
* writing checkpoints., a boolean flag; default \c true.}
* @config{config_base, write the base configuration file if creating the
- * database\, see @ref config_base for more information., a boolean flag;
- * default \c true.}
+ * database. If \c false in the config passed directly to ::wiredtiger_open\,
+ * will ignore any existing base configuration file in addition to not creating
+ * one. See @ref config_base for more information., a boolean flag; default \c
+ * true.}
* @config{create, create the database if it does not exist., a boolean flag;
* default \c false.}
* @config{direct_io, Use \c O_DIRECT to access files. Options are given as a
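The sharpened config_base wording is worth restating in code: passing \c false directly to ::wiredtiger_open now both suppresses writing the base configuration file and ignores an existing one. A minimal sketch (the home path is a placeholder):

#include <wiredtiger.h>

static int
open_ignoring_basecfg(const char *home, WT_CONNECTION **connp)
{
        /*
         * With config_base=false in the direct config, an existing
         * base configuration file in home is ignored, not merged.
         */
        return (wiredtiger_open(home, NULL,
            "create,config_base=false", connp));
}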
@@ -2214,10 +2219,12 @@ struct __wt_connection {
* @config{&nbsp;&nbsp;&nbsp;&nbsp;name, the name of a cache that is shared
* between databases or \c "none" when no shared cache is configured., a string;
* default \c none.}
- * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache
- * this database is guaranteed to have available from the shared cache. This
- * setting is per database. Defaults to the chunk size., an integer; default \c
- * 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;quota, maximum size of
+ * cache this database can be allocated from the shared cache. Defaults to the
+ * entire shared cache size., an integer; default \c 0.}
+ * @config{&nbsp;&nbsp;&nbsp;&nbsp;reserve, amount of cache this database is
+ * guaranteed to have available from the shared cache. This setting is per
+ * database. Defaults to the chunk size., an integer; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;size, maximum memory to allocate for the
* shared cache. Setting this will update the value if one is already set., an
* integer between 1MB and 10TB; default \c 500MB.}
@@ -3640,192 +3647,210 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1047
/*! cache: in-memory page splits */
#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1048
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1049
+/*! cache: lookaside table insert calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1050
+/*! cache: lookaside table remove calls */
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1051
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1049
+#define WT_STAT_CONN_CACHE_OVERHEAD 1052
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1050
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1053
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1051
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1054
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1052
+#define WT_STAT_CONN_CACHE_READ 1055
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1056
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1053
+#define WT_STAT_CONN_CACHE_WRITE 1057
+/*! cache: page written requiring lookaside records */
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1058
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1059
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1054
+#define WT_STAT_CONN_COND_WAIT 1060
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1055
+#define WT_STAT_CONN_CURSOR_CREATE 1061
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1056
+#define WT_STAT_CONN_CURSOR_INSERT 1062
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1057
+#define WT_STAT_CONN_CURSOR_NEXT 1063
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1058
+#define WT_STAT_CONN_CURSOR_PREV 1064
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1059
+#define WT_STAT_CONN_CURSOR_REMOVE 1065
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1060
+#define WT_STAT_CONN_CURSOR_RESET 1066
+/*! cursor: cursor restarted searches */
+#define WT_STAT_CONN_CURSOR_RESTART 1067
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1061
+#define WT_STAT_CONN_CURSOR_SEARCH 1068
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1062
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1069
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1063
-/*! data-handle: connection dhandles swept */
-#define WT_STAT_CONN_DH_CONN_HANDLES 1064
-/*! data-handle: connection candidate referenced */
-#define WT_STAT_CONN_DH_CONN_REF 1065
-/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_CONN_SWEEPS 1066
-/*! data-handle: connection time-of-death sets */
-#define WT_STAT_CONN_DH_CONN_TOD 1067
+#define WT_STAT_CONN_CURSOR_UPDATE 1070
+/*! data-handle: connection data handles currently active */
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1071
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1068
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1072
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1069
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1073
+/*! data-handle: connection sweep dhandles closed */
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1074
+/*! data-handle: connection sweep candidate became referenced */
+#define WT_STAT_CONN_DH_SWEEP_REF 1075
+/*! data-handle: connection sweep dhandles removed from hash list */
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1076
+/*! data-handle: connection sweep time-of-death sets */
+#define WT_STAT_CONN_DH_SWEEP_TOD 1077
+/*! data-handle: connection sweeps */
+#define WT_STAT_CONN_DH_SWEEPS 1078
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1070
+#define WT_STAT_CONN_FILE_OPEN 1079
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1071
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1080
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1072
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1081
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1073
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1082
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1074
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1083
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1075
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1084
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1076
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1085
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1077
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1086
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1078
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1087
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1079
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1088
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1080
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1089
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1081
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1090
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1082
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1091
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1083
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1092
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1084
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1093
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1085
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1094
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1086
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1095
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1087
+#define WT_STAT_CONN_LOG_SCANS 1096
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1088
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1097
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1089
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1098
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1090
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1099
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1091
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1100
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1092
-/*! log: record size exceeded maximum */
-#define WT_STAT_CONN_LOG_SLOT_TOOBIG 1093
-/*! log: failed to find a slot large enough for record */
-#define WT_STAT_CONN_LOG_SLOT_TOOSMALL 1094
+#define WT_STAT_CONN_LOG_SLOT_RACES 1101
+/*! log: busy returns attempting to switch slots */
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1095
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1103
+/*! log: consolidated slot unbuffered writes */
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1104
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1096
+#define WT_STAT_CONN_LOG_SYNC 1105
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1097
+#define WT_STAT_CONN_LOG_SYNC_DIR 1106
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1098
+#define WT_STAT_CONN_LOG_WRITE_LSN 1107
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1099
+#define WT_STAT_CONN_LOG_WRITES 1108
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1100
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1109
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1101
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1110
/*! LSM: rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1102
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1111
/*! LSM: application work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1103
+#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1112
/*! LSM: merge work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1104
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1113
/*! LSM: tree queue hit maximum */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1105
+#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1114
/*! LSM: switch work units currently queued */
-#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1106
+#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1115
/*! LSM: tree maintenance operations scheduled */
-#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1107
+#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1116
/*! LSM: tree maintenance operations discarded */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1108
+#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1117
/*! LSM: tree maintenance operations executed */
-#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1109
+#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1118
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1110
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1119
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1111
+#define WT_STAT_CONN_MEMORY_FREE 1120
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1112
+#define WT_STAT_CONN_MEMORY_GROW 1121
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1113
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1122
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1114
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1123
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1115
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1124
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1116
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1125
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1117
+#define WT_STAT_CONN_PAGE_SLEEP 1126
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1118
+#define WT_STAT_CONN_READ_IO 1127
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1119
+#define WT_STAT_CONN_REC_PAGES 1128
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1120
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1129
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1121
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1130
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1122
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1131
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1123
+#define WT_STAT_CONN_RWLOCK_READ 1132
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1124
+#define WT_STAT_CONN_RWLOCK_WRITE 1133
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1125
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1134
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1126
+#define WT_STAT_CONN_SESSION_OPEN 1135
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1127
+#define WT_STAT_CONN_TXN_BEGIN 1136
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1128
+#define WT_STAT_CONN_TXN_CHECKPOINT 1137
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1129
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1138
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1130
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1139
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1131
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1140
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1132
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1141
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1133
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1142
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1134
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1143
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1135
+#define WT_STAT_CONN_TXN_COMMIT 1144
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1136
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1145
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1137
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1146
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1138
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1147
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1139
+#define WT_STAT_CONN_TXN_ROLLBACK 1148
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1140
+#define WT_STAT_CONN_TXN_SYNC 1149
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1141
+#define WT_STAT_CONN_WRITE_IO 1150
/*!
* @}
@@ -3875,146 +3900,158 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_BTREE_COLUMN_FIX 2019
/*! btree: column-store internal pages */
#define WT_STAT_DSRC_BTREE_COLUMN_INTERNAL 2020
+/*! btree: column-store variable-size RLE encoded values */
+#define WT_STAT_DSRC_BTREE_COLUMN_RLE 2021
/*! btree: column-store variable-size leaf pages */
-#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2021
+#define WT_STAT_DSRC_BTREE_COLUMN_VARIABLE 2022
/*! btree: pages rewritten by compaction */
-#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2022
+#define WT_STAT_DSRC_BTREE_COMPACT_REWRITE 2023
/*! btree: number of key/value pairs */
-#define WT_STAT_DSRC_BTREE_ENTRIES 2023
+#define WT_STAT_DSRC_BTREE_ENTRIES 2024
/*! btree: fixed-record size */
-#define WT_STAT_DSRC_BTREE_FIXED_LEN 2024
+#define WT_STAT_DSRC_BTREE_FIXED_LEN 2025
/*! btree: maximum tree depth */
-#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2025
+#define WT_STAT_DSRC_BTREE_MAXIMUM_DEPTH 2026
/*! btree: maximum internal page key size */
-#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2026
+#define WT_STAT_DSRC_BTREE_MAXINTLKEY 2027
/*! btree: maximum internal page size */
-#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2027
+#define WT_STAT_DSRC_BTREE_MAXINTLPAGE 2028
/*! btree: maximum leaf page key size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2028
+#define WT_STAT_DSRC_BTREE_MAXLEAFKEY 2029
/*! btree: maximum leaf page size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2029
+#define WT_STAT_DSRC_BTREE_MAXLEAFPAGE 2030
/*! btree: maximum leaf page value size */
-#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2030
+#define WT_STAT_DSRC_BTREE_MAXLEAFVALUE 2031
/*! btree: overflow pages */
-#define WT_STAT_DSRC_BTREE_OVERFLOW 2031
+#define WT_STAT_DSRC_BTREE_OVERFLOW 2032
/*! btree: row-store internal pages */
-#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2032
+#define WT_STAT_DSRC_BTREE_ROW_INTERNAL 2033
/*! btree: row-store leaf pages */
-#define WT_STAT_DSRC_BTREE_ROW_LEAF 2033
+#define WT_STAT_DSRC_BTREE_ROW_LEAF 2034
/*! cache: bytes read into cache */
-#define WT_STAT_DSRC_CACHE_BYTES_READ 2034
+#define WT_STAT_DSRC_CACHE_BYTES_READ 2035
/*! cache: bytes written from cache */
-#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2035
+#define WT_STAT_DSRC_CACHE_BYTES_WRITE 2036
/*! cache: checkpoint blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2036
+#define WT_STAT_DSRC_CACHE_EVICTION_CHECKPOINT 2037
/*! cache: unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2037
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2038
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2038
+#define WT_STAT_DSRC_CACHE_EVICTION_DEEPEN 2039
/*! cache: modified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2039
+#define WT_STAT_DSRC_CACHE_EVICTION_DIRTY 2040
/*! cache: data source pages selected for eviction unable to be evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2040
+#define WT_STAT_DSRC_CACHE_EVICTION_FAIL 2041
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2041
+#define WT_STAT_DSRC_CACHE_EVICTION_HAZARD 2042
/*! cache: internal pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2042
+#define WT_STAT_DSRC_CACHE_EVICTION_INTERNAL 2043
/*! cache: pages split during eviction */
-#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2043
+#define WT_STAT_DSRC_CACHE_EVICTION_SPLIT 2044
/*! cache: in-memory page splits */
-#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2044
+#define WT_STAT_DSRC_CACHE_INMEM_SPLIT 2045
+/*! cache: in-memory page passed criteria to be split */
+#define WT_STAT_DSRC_CACHE_INMEM_SPLITTABLE 2046
/*! cache: overflow values cached in memory */
-#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2045
+#define WT_STAT_DSRC_CACHE_OVERFLOW_VALUE 2047
/*! cache: pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ 2046
+#define WT_STAT_DSRC_CACHE_READ 2048
+/*! cache: pages read into cache requiring lookaside entries */
+#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2049
/*! cache: overflow pages read into cache */
-#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2047
+#define WT_STAT_DSRC_CACHE_READ_OVERFLOW 2050
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2048
+#define WT_STAT_DSRC_CACHE_WRITE 2051
+/*! cache: page written requiring lookaside records */
+#define WT_STAT_DSRC_CACHE_WRITE_LOOKASIDE 2052
+/*! cache: pages written requiring in-memory restoration */
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2053
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2049
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2054
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2050
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2055
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2051
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2056
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2052
+#define WT_STAT_DSRC_COMPRESS_READ 2057
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2053
+#define WT_STAT_DSRC_COMPRESS_WRITE 2058
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2054
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2059
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2055
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2060
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2056
+#define WT_STAT_DSRC_CURSOR_CREATE 2061
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2057
+#define WT_STAT_DSRC_CURSOR_INSERT 2062
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2058
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2063
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2059
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2064
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2060
+#define WT_STAT_DSRC_CURSOR_NEXT 2065
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2061
+#define WT_STAT_DSRC_CURSOR_PREV 2066
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2062
+#define WT_STAT_DSRC_CURSOR_REMOVE 2067
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2063
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2068
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2064
+#define WT_STAT_DSRC_CURSOR_RESET 2069
+/*! cursor: restarted searches */
+#define WT_STAT_DSRC_CURSOR_RESTART 2070
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2065
+#define WT_STAT_DSRC_CURSOR_SEARCH 2071
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2066
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2072
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2067
+#define WT_STAT_DSRC_CURSOR_UPDATE 2073
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2068
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2074
/*! LSM: sleep for LSM checkpoint throttle */
-#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2069
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2075
/*! LSM: chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2070
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2076
/*! LSM: highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2071
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2077
/*! LSM: queries that could have benefited from a Bloom filter that did
* not exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2072
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2078
/*! LSM: sleep for LSM merge throttle */
-#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2073
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2079
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2074
+#define WT_STAT_DSRC_REC_DICTIONARY 2080
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2075
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2081
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2076
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2082
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2077
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2083
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2078
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2084
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2079
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2085
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2080
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2086
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2081
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2087
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2082
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2088
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2083
+#define WT_STAT_DSRC_REC_PAGES 2089
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2084
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2090
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2085
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2091
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2086
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2092
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2087
+#define WT_STAT_DSRC_SESSION_COMPACT 2093
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2088
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2094
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2089
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2095
/*! @} */
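Any of the renumbered keys can be read back through a statistics cursor, after the pattern of examples/c/ex_stat.c. A sketch reading the new cursor-restart data-source statistic (the table name is a placeholder, and the connection must have been opened with statistics enabled):

#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

static int
print_cursor_restarts(WT_SESSION *session)
{
        WT_CURSOR *cursor;
        const char *desc, *pvalue;
        uint64_t value;
        int ret;

        if ((ret = session->open_cursor(session,
            "statistics:table:mytable", NULL, NULL, &cursor)) != 0)
                return (ret);
        cursor->set_key(cursor, WT_STAT_DSRC_CURSOR_RESTART);
        if ((ret = cursor->search(cursor)) == 0 &&
            (ret = cursor->get_value(cursor, &desc, &pvalue, &value)) == 0)
                printf("%s: %" PRIu64 "\n", desc, value);
        (void)cursor->close(cursor);
        return (ret);
}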
/*
* Statistics section: END
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 64e29e104bc..4d46a25b63c 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -41,6 +41,7 @@ extern "C" {
#else
#include <pthread.h>
#endif
+#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
@@ -55,11 +56,6 @@ extern "C" {
#include <windows.h>
#endif
-/*******************************************
- * WiredTiger externally maintained include files.
- *******************************************/
-#include "queue.h"
-
/*
* DO NOT EDIT: automatically built by dist/s_typedef.
* Forward type declarations for internal types: BEGIN
@@ -182,12 +178,18 @@ struct __wt_insert_head;
typedef struct __wt_insert_head WT_INSERT_HEAD;
struct __wt_keyed_encryptor;
typedef struct __wt_keyed_encryptor WT_KEYED_ENCRYPTOR;
+struct __wt_log;
+ typedef struct __wt_log WT_LOG;
struct __wt_log_desc;
typedef struct __wt_log_desc WT_LOG_DESC;
struct __wt_log_op_desc;
typedef struct __wt_log_op_desc WT_LOG_OP_DESC;
struct __wt_log_rec_desc;
typedef struct __wt_log_rec_desc WT_LOG_REC_DESC;
+struct __wt_log_record;
+ typedef struct __wt_log_record WT_LOG_RECORD;
+struct __wt_logslot;
+ typedef struct __wt_logslot WT_LOGSLOT;
struct __wt_lsm_chunk;
typedef struct __wt_lsm_chunk WT_LSM_CHUNK;
struct __wt_lsm_data_source;
@@ -204,6 +206,8 @@ struct __wt_lsm_worker_cookie;
typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
struct __wt_multi;
typedef struct __wt_multi WT_MULTI;
+struct __wt_myslot;
+ typedef struct __wt_myslot WT_MYSLOT;
struct __wt_named_collator;
typedef struct __wt_named_collator WT_NAMED_COLLATOR;
struct __wt_named_compressor;
@@ -242,16 +246,18 @@ struct __wt_rwlock;
typedef struct __wt_rwlock WT_RWLOCK;
struct __wt_salvage_cookie;
typedef struct __wt_salvage_cookie WT_SALVAGE_COOKIE;
+struct __wt_save_upd;
+ typedef struct __wt_save_upd WT_SAVE_UPD;
struct __wt_scratch_track;
typedef struct __wt_scratch_track WT_SCRATCH_TRACK;
struct __wt_session_impl;
typedef struct __wt_session_impl WT_SESSION_IMPL;
struct __wt_size;
typedef struct __wt_size WT_SIZE;
+struct __wt_spinlock;
+ typedef struct __wt_spinlock WT_SPINLOCK;
struct __wt_split_stash;
typedef struct __wt_split_stash WT_SPLIT_STASH;
-struct __wt_stats;
- typedef struct __wt_stats WT_STATS;
struct __wt_table;
typedef struct __wt_table WT_TABLE;
struct __wt_txn;
@@ -262,8 +268,6 @@ struct __wt_txn_op;
typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_state;
typedef struct __wt_txn_state WT_TXN_STATE;
-struct __wt_upd_skipped;
- typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
struct __wt_update;
typedef struct __wt_update WT_UPDATE;
union __wt_rand_state;
@@ -285,6 +289,8 @@ union __wt_rand_state;
#endif
#include "hardware.h"
+#include "queue.h"
+
#ifdef _WIN32
#include "os_windows.h"
#else
@@ -330,6 +336,7 @@ union __wt_rand_state;
#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
+#include "log.i"
#include "mutex.i" /* required by btree.i */
#include "txn.i" /* required by btree.i */
diff --git a/src/log/log.c b/src/log/log.c
index 4242571fe53..4041761d062 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -34,6 +34,24 @@ __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
}
/*
+ * __wt_log_ckpt_lsn --
+ * Force out buffered records and return an LSN for checkpoint.
+ */
+int
+__wt_log_ckpt_lsn(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+
+ conn = S2C(session);
+ log = conn->log;
+ WT_RET(__wt_log_force_write(session, 1));
+ WT_RET(__wt_log_wrlsn(session));
+ *ckp_lsn = log->write_start_lsn;
+ return (0);
+}
+
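__wt_log_ckpt_lsn forces the active slot out, runs the write-LSN processing, and hands back write_start_lsn; the WT_LOG_CMP-to-__wt_log_cmp conversions throughout the rest of this file all rely on the same ordering, file number first and offset second. An illustrative comparator (the field types are assumptions, not the real WT_LSN layout):

#include <stdint.h>

struct lsn {
        uint32_t file;          /* log file number */
        uint64_t offset;        /* byte offset within the file */
};

static int
lsn_cmp(const struct lsn *a, const struct lsn *b)
{
        if (a->file != b->file)
                return (a->file < b->file ? -1 : 1);
        if (a->offset != b->offset)
                return (a->offset < b->offset ? -1 : 1);
        return (0);
}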
+/*
* __wt_log_background --
* Record the given LSN as the background LSN and signal the
* thread as needed.
@@ -53,7 +71,7 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
* needed.
*/
__wt_spin_lock(session, &log->log_sync_lock);
- if (WT_LOG_CMP(lsn, &log->bg_sync_lsn) > 0)
+ if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0)
log->bg_sync_lsn = *lsn;
__wt_spin_unlock(session, &log->log_sync_lock);
return (__wt_cond_signal(session, conn->log_file_cond));
@@ -100,7 +118,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
/*
* Sync the log file if needed.
*/
- if (WT_LOG_CMP(&log->sync_lsn, min_lsn) < 0) {
+ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_force_sync: sync to LSN %d/%lu",
min_lsn->file, min_lsn->offset));
@@ -241,6 +259,11 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
log = S2C(session)->log;
*maxid = 0;
+ /*
+ * These may be files needed by backup. Force the current slot
+ * to get written to the file.
+ */
+ WT_RET(__wt_log_force_write(session, 1));
WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count));
/* Filter out any files that are below the checkpoint LSN. */
@@ -354,70 +377,12 @@ static int
__log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize)
{
WT_CONNECTION_IMPL *conn;
-
- conn = S2C(session);
- return (lsn->offset + (wt_off_t)recsize < conn->log_file_max);
-}
-
-/*
- * __log_acquire --
- * Called with the log slot lock held. Can be called recursively
- * from __wt_log_newfile when we change log files.
- */
-static int
-__log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
-{
- WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int created_log;
conn = S2C(session);
log = conn->log;
- created_log = 1;
- /*
- * Called locked. Add recsize to alloc_lsn. Save our starting LSN
- * where the previous allocation finished for the release LSN.
- * That way when log files switch, we're waiting for the correct LSN
- * from outstanding writes.
- */
- slot->slot_release_lsn = log->alloc_lsn;
- if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
- WT_RET(__wt_log_newfile(session, 0, &created_log));
- if (log->log_close_fh != NULL)
- F_SET(slot, WT_SLOT_CLOSEFH);
- }
-
- /*
- * Checkpoints can be configured based on amount of log written.
- * Add in this log record to the sum and if needed, signal the
- * checkpoint condition. The logging subsystem manages the
- * accumulated field. There is a bit of layering violation
- * here checking the connection ckpt field and using its
- * condition.
- */
- if (WT_CKPT_LOGSIZE(conn)) {
- log->log_written += (wt_off_t)recsize;
- WT_RET(__wt_checkpoint_signal(session, log->log_written));
- }
-
- /*
- * Need to minimally fill in slot info here. Our slot start LSN
- * comes after any potential new log file creations.
- */
- slot->slot_start_lsn = log->alloc_lsn;
- slot->slot_start_offset = log->alloc_lsn.offset;
- /*
- * Pre-allocate on the first real write into the log file, if it
- * was just created (i.e. not pre-allocated).
- */
- if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
- WT_RET(__log_prealloc(session, log->log_fh));
-
- log->alloc_lsn.offset += (wt_off_t)recsize;
- slot->slot_end_lsn = log->alloc_lsn;
- slot->slot_error = 0;
- slot->slot_fh = log->log_fh;
- return (0);
+ return (lsn->offset == WT_LOG_FIRST_RECORD ||
+ lsn->offset + (wt_off_t)recsize < conn->log_file_max);
}
/*
@@ -490,24 +455,32 @@ __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out)
*/
static int
__log_fill(WT_SESSION_IMPL *session,
- WT_MYSLOT *myslot, int direct, WT_ITEM *record, WT_LSN *lsnp)
+ WT_MYSLOT *myslot, int force, WT_ITEM *record, WT_LSN *lsnp)
{
WT_DECL_RET;
WT_LOG_RECORD *logrec;
+ /*
+ * The WT_LOG_SLOT_BUF_MAX macro uses log.
+ */
logrec = (WT_LOG_RECORD *)record->mem;
/*
- * Call __wt_write. For now the offset is the real byte offset. If the
- * offset becomes a unit of WT_LOG_ALIGN this is where we would multiply
- * by WT_LOG_ALIGN to get the real file byte offset for write().
+ * Call __wt_write or copy into the buffer. For now the offset is the
+ * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this
+ * is where we would multiply by WT_LOG_ALIGN to get the real file byte
+ * offset for write().
*/
- if (direct)
+ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
+ logrec, logrec->len);
+ else
+ /*
+ * If this is a force or unbuffered write, write it now.
+ * A forced write sends in a temporary, local slot.
+ */
WT_ERR(__wt_write(session, myslot->slot->slot_fh,
myslot->offset + myslot->slot->slot_start_offset,
(size_t)logrec->len, (void *)logrec));
- else
- memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset,
- logrec, logrec->len);
WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len);
if (lsnp != NULL) {
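In the rewritten __log_fill, an ordinary joiner only copies its record into the slot's shared buffer at the offset it claimed when joining; forced and unbuffered records bypass the buffer and go straight to __wt_write. A toy of the buffered branch (the structure is a stand-in, not the real WT_LOGSLOT):

#include <stdint.h>
#include <string.h>

struct toy_slot {
        uint8_t *buf;           /* shared consolidation buffer */
};

static void
toy_fill(struct toy_slot *slot, uint64_t my_offset,
    const void *rec, uint32_t len)
{
        /* One later write of the whole buffer replaces many small writes. */
        memcpy(slot->buf + my_offset, rec, len);
}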
@@ -563,13 +536,13 @@ __log_file_header(
logrec->checksum = 0;
logrec->checksum = __wt_cksum(logrec, log->allocsize);
WT_CLEAR(tmp);
+ memset(&myslot, 0, sizeof(myslot));
myslot.slot = &tmp;
- myslot.offset = 0;
/*
- * We may recursively call __log_acquire to allocate log space for the
- * log descriptor record. Call __log_fill to write it, but we
- * do not need to call __log_release because we're not waiting for
+ * We may recursively call __wt_log_acquire to allocate log space for
+ * the log descriptor record. Call __log_fill to write it, but we
+ * do not need to call __wt_log_release because we're not waiting for
* any earlier operations to complete.
*/
if (prealloc) {
@@ -577,7 +550,7 @@ __log_file_header(
tmp.slot_fh = fh;
} else {
WT_ASSERT(session, fh == NULL);
- WT_ERR(__log_acquire(session, logrec->len, &tmp));
+ WT_ERR(__wt_log_acquire(session, logrec->len, &tmp));
}
WT_ERR(__log_fill(session, &myslot, 1, buf, NULL));
/*
@@ -697,6 +670,146 @@ err: __wt_scr_free(session, &from_path);
}
/*
+ * __log_newfile --
+ * Create the next log file and write the file header record into it.
+ */
+static int
+__log_newfile(WT_SESSION_IMPL *session, int conn_open, int *created)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_LOG *log;
+ WT_LSN end_lsn;
+ int create_log, yield_cnt;
+
+ conn = S2C(session);
+ log = conn->log;
+
+ create_log = 1;
+ yield_cnt = 0;
+ /*
+ * Set aside the log file handle to be closed later. Other threads
+ * may still be using it to write to the log. If the log file size
+ * is small we could fill a log file before the previous one is closed.
+ * Wait for that to close.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ while (log->log_close_fh != NULL) {
+ WT_STAT_FAST_CONN_INCR(session, log_close_yields);
+ WT_RET(__wt_log_wrlsn(session));
+ if (++yield_cnt > 10000)
+ return (EBUSY);
+ __wt_yield();
+ }
+ log->log_close_fh = log->log_fh;
+ if (log->log_close_fh != NULL)
+ log->log_close_lsn = log->alloc_lsn;
+ log->fileid++;
+ /*
+ * Make sure everything we set above is visible.
+ */
+ WT_FULL_BARRIER();
+ /*
+ * If we're pre-allocating log files, look for one. If there aren't any
+ * or we're not pre-allocating, then create one.
+ */
+ if (conn->log_prealloc) {
+ ret = __log_alloc_prealloc(session, log->fileid);
+ /*
+ * If ret is 0 it means we found a pre-allocated file.
+ * If ret is non-zero but not WT_NOTFOUND, we return the error.
+ * If ret is WT_NOTFOUND, we leave create_log set and create
+ * the new log file.
+ */
+ if (ret == 0)
+ create_log = 0;
+ /*
+ * If we get any error other than WT_NOTFOUND, return it.
+ */
+ if (ret != 0 && ret != WT_NOTFOUND)
+ return (ret);
+ ret = 0;
+ }
+ /*
+ * If we need to create the log file, do so now.
+ */
+ if (create_log) {
+ log->prep_missed++;
+ WT_RET(__wt_log_allocfile(
+ session, log->fileid, WT_LOG_FILENAME, 1));
+ }
+ WT_RET(__log_openfile(session,
+ 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
+ /*
+ * We need to set up the LSNs. Set the end LSN and alloc LSN to
+ * the end of the header.
+ */
+ log->alloc_lsn.file = log->fileid;
+ log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
+ end_lsn = log->alloc_lsn;
+
+ /*
+ * If we're called from connection creation code, we need to update
+ * the LSNs since we're the only write in progress.
+ */
+ if (conn_open) {
+ WT_RET(__wt_fsync(session, log->log_fh));
+ log->sync_lsn = end_lsn;
+ log->write_lsn = end_lsn;
+ log->write_start_lsn = end_lsn;
+ }
+ if (created != NULL)
+ *created = create_log;
+ return (0);
+}
+
+/*
+ * __wt_log_acquire --
+ * Called serially when switching slots. Can be called recursively
+ * from __log_newfile when we change log files.
+ */
+int
+__wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ int created_log;
+
+ conn = S2C(session);
+ log = conn->log;
+ created_log = 1;
+ /*
+ * Add recsize to alloc_lsn. Save our starting LSN
+ * where the previous allocation finished for the release LSN.
+ * That way when log files switch, we're waiting for the correct LSN
+ * from outstanding writes.
+ */
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+ /*
+ * We need to set the release LSN earlier, before a log file change.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ if (!__log_size_fit(session, &log->alloc_lsn, recsize)) {
+ WT_RET(__log_newfile(session, 0, &created_log));
+ if (log->log_close_fh != NULL)
+ F_SET(slot, WT_SLOT_CLOSEFH);
+ }
+
+ /*
+ * Pre-allocate on the first real write into the log file, if it
+ * was just created (i.e. not pre-allocated).
+ */
+ if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log)
+ WT_RET(__log_prealloc(session, log->log_fh));
+ /*
+ * Initialize the slot for activation.
+ */
+ __wt_log_slot_activate(session, slot);
+
+ return (0);
+}
+
+/*
* __log_truncate --
* Truncate the log to the given LSN. If this_log is set, it will only
* truncate the log file indicated in the given LSN. If not set,
@@ -791,7 +904,7 @@ __wt_log_allocfile(
*/
WT_RET(__wt_scr_alloc(session, 0, &from_path));
WT_ERR(__wt_scr_alloc(session, 0, &to_path));
- tmp_id = WT_ATOMIC_ADD4(log->tmp_fileid, 1);
+ tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1);
WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path));
WT_ERR(__log_filename(session, lognum, dest, to_path));
/*
@@ -842,7 +955,7 @@ err: __wt_scr_free(session, &path);
* __wt_log_open --
* Open the appropriate log file for the connection. The purpose is
* to find the last log file that exists, open it and set our initial
- * LSNs to the end of that file. If none exist, call __wt_log_newfile
+ * LSNs to the end of that file. If none exist, call __log_newfile
* to create it.
*/
int
@@ -917,7 +1030,9 @@ __wt_log_open(WT_SESSION_IMPL *session)
* Start logging at the beginning of the next log file, no matter
* where the previous log file ends.
*/
- WT_ERR(__wt_log_newfile(session, 1, NULL));
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_newfile(session, 1, NULL));
+ WT_ERR(ret);
/* If we found log files, save the new state. */
if (logcount > 0) {
@@ -1055,48 +1170,67 @@ err:
}
/*
- * __log_release --
+ * __wt_log_release --
* Release a log slot.
*/
-static int
-__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
+int
+__wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LOG *log;
WT_LSN sync_lsn;
- size_t write_size;
- int locked, yield_count;
+ int locked, need_relock, yield_count;
+ int64_t release_buffered, release_bytes;
conn = S2C(session);
log = conn->log;
- locked = yield_count = 0;
- *freep = 1;
+ locked = need_relock = yield_count = 0;
+ if (freep != NULL)
+ *freep = 1;
+ release_buffered =
+ WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ release_bytes = release_buffered + slot->slot_unbuffered;
/* Write the buffered records */
- if (F_ISSET(slot, WT_SLOT_BUFFERED)) {
- write_size = (size_t)
- (slot->slot_end_lsn.offset - slot->slot_start_offset);
- WT_ERR(__wt_write(session, slot->slot_fh,
- slot->slot_start_offset, write_size, slot->slot_buf.mem));
+ /*
+ * Checkpoints can be configured based on amount of log written.
+ * Add in this log record to the sum and if needed, signal the
+ * checkpoint condition. The logging subsystem manages the
+ * accumulated field. There is a bit of layering violation
+ * here checking the connection ckpt field and using its
+ * condition.
+ */
+ if (WT_CKPT_LOGSIZE(conn)) {
+ log->log_written += (wt_off_t)release_bytes;
+ WT_RET(__wt_checkpoint_signal(session, log->log_written));
}
+ if (release_buffered != 0)
+ WT_ERR(__wt_write(session,
+ slot->slot_fh, slot->slot_start_offset,
+ (size_t)release_buffered, slot->slot_buf.mem));
+
/*
- * If this is not a buffered write, meaning the slot we have is a
- * dummy constructed slot, not from the slot pool, or we have to wait
- * for a synchronous operation, we do not pass handling of this slot
- * off to the worker thread. The caller is responsible for freeing
- * the slot in that case. Otherwise the worker thread will free it.
+ * If we have to wait for a synchronous operation, we do not pass
+ * handling of this slot off to the worker thread. The caller is
+ * responsible for freeing the slot in that case. Otherwise the
+ * worker thread will free it.
*/
- if (F_ISSET(slot, WT_SLOT_BUFFERED) &&
- !F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
- *freep = 0;
+ if (!F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) {
+ if (freep != NULL)
+ *freep = 0;
slot->slot_state = WT_LOG_SLOT_WRITTEN;
/*
* After this point the worker thread owns the slot. There
* is nothing more to do but return.
*/
- WT_ERR(__wt_cond_signal(session, conn->log_wrlsn_cond));
+ /*
+ * !!! Signalling the wrlsn_cond condition here results in
+ * worse performance because it causes more scheduling churn
+ * and more walking of the slot pool for a very small number
+ * of slots to process. Don't signal here.
+ */
goto done;
}
@@ -1105,15 +1239,31 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* be holes in the log file.
*/
WT_STAT_FAST_CONN_INCR(session, log_release_write_lsn);
- while (WT_LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) {
+ /*
+ * If we're on a locked path and the write LSN is not advancing,
+ * unlock in case an earlier thread is trying to switch its
+ * slot and complete its operation.
+ */
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_unlock(session, &log->log_slot_lock);
+ need_relock = 1;
+ }
if (++yield_count < 1000)
__wt_yield();
else
WT_ERR(__wt_cond_wait(
session, log->log_write_cond, 200));
+ if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) {
+ __wt_spin_lock(session, &log->log_slot_lock);
+ need_relock = 0;
+ }
}
+
log->write_start_lsn = slot->slot_start_lsn;
log->write_lsn = slot->slot_end_lsn;
+
+ WT_ASSERT(session, slot != log->active_slot);
WT_ERR(__wt_cond_signal(session, log->log_write_cond));
/*
@@ -1168,7 +1318,7 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
* Sync the log file if needed.
*/
if (F_ISSET(slot, WT_SLOT_SYNC) &&
- WT_LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
+ __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_release: sync log %s", log->log_fh->name));
WT_STAT_FAST_CONN_INCR(session, log_sync);
@@ -1186,6 +1336,8 @@ __log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *freep)
}
err: if (locked)
__wt_spin_unlock(session, &log->log_sync_lock);
+ if (need_relock)
+ __wt_spin_lock(session, &log->log_slot_lock);
if (ret != 0 && slot->slot_error == 0)
slot->slot_error = ret;
done:
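WT_LOG_SLOT_RELEASED_BUFFERED and WT_LOG_SLOT_DONE hint at the new bookkeeping: a single atomically updated slot_state word tracks both the bytes joined to a slot and the bytes released back, so the thread releasing the last bytes knows the slot is ready to process. An illustrative packing, not the real log.h masks:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical split: joined count in the high half, released low. */
#define SLOT_JOINED(state)      ((uint32_t)((uint64_t)(state) >> 32))
#define SLOT_RELEASED(state)    ((uint32_t)(uint64_t)(state))

static bool
slot_done(int64_t state)
{
        /* The slot is finished once every joined byte was released. */
        return (SLOT_JOINED(state) == SLOT_RELEASED(state));
}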
@@ -1193,93 +1345,6 @@ done:
}
/*
- * __wt_log_newfile --
- * Create the next log file and write the file header record into it.
- */
-int
-__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create, int *created)
-{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_LOG *log;
- WT_LSN end_lsn;
- int create_log;
-
- conn = S2C(session);
- log = conn->log;
-
- create_log = 1;
- /*
- * Set aside the log file handle to be closed later. Other threads
- * may still be using it to write to the log. If the log file size
- * is small we could fill a log file before the previous one is closed.
- * Wait for that to close.
- */
- while (log->log_close_fh != NULL) {
- WT_STAT_FAST_CONN_INCR(session, log_close_yields);
- WT_RET(__wt_log_wrlsn(session, NULL, NULL));
- __wt_yield();
- }
- log->log_close_fh = log->log_fh;
- log->fileid++;
-
- /*
- * If we're pre-allocating log files, look for one. If there aren't any
- * or we're not pre-allocating, then create one.
- */
- ret = 0;
- if (conn->log_prealloc) {
- ret = __log_alloc_prealloc(session, log->fileid);
- /*
- * If ret is 0 it means we found a pre-allocated file.
- * If ret is non-zero but not WT_NOTFOUND, we return the error.
- * If ret is WT_NOTFOUND, we leave create_log set and create
- * the new log file.
- */
- if (ret == 0)
- create_log = 0;
- /*
- * If we get any error other than WT_NOTFOUND, return it.
- */
- if (ret != 0 && ret != WT_NOTFOUND)
- return (ret);
- ret = 0;
- }
- /*
- * If we need to create the log file, do so now.
- */
- if (create_log) {
- log->prep_missed++;
- if ((ret = __wt_log_allocfile(
- session, log->fileid, WT_LOG_FILENAME, 0)) != 0)
- return (ret);
- }
- WT_RET(__log_openfile(session,
- 0, &log->log_fh, WT_LOG_FILENAME, log->fileid));
- /*
- * We need to setup the LSNs. Set the end LSN and alloc LSN to
- * the end of the header.
- */
- log->alloc_lsn.file = log->fileid;
- log->alloc_lsn.offset = WT_LOG_FIRST_RECORD;
- end_lsn = log->alloc_lsn;
-
- /*
- * If we're called from connection creation code, we need to update
- * the LSNs since we're the only write in progress.
- */
- if (conn_create) {
- WT_RET(__wt_fsync(session, log->log_fh));
- log->sync_lsn = end_lsn;
- log->write_lsn = end_lsn;
- log->write_start_lsn = end_lsn;
- }
- if (created != NULL)
- *created = create_log;
- return (0);
-}
-
-/*
* __wt_log_scan --
* Scan the logs, calling a function on each record found.
*/
@@ -1535,7 +1600,7 @@ advance:
/* Truncate if we're in recovery. */
if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
- WT_LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
+ __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0)
WT_ERR(__log_truncate(session,
&rd_lsn, WT_LOG_FILENAME, 0));
@@ -1559,43 +1624,20 @@ err: WT_STAT_FAST_CONN_INCR(session, log_scans);
}
/*
- * __log_direct_write --
- * Write a log record without using the consolidation arrays.
+ * __wt_log_force_write --
+ * Force a switch and release and write of the current slot.
+ * Wrapper function that takes the lock.
*/
-static int
-__log_direct_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
- uint32_t flags)
+int
+__wt_log_force_write(WT_SESSION_IMPL *session, int retry)
{
- WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT tmp;
WT_MYSLOT myslot;
- int dummy, locked;
log = S2C(session)->log;
- myslot.slot = &tmp;
- myslot.offset = 0;
- dummy = 0;
- WT_CLEAR(tmp);
-
- /* Fast path the contended case. */
- if (__wt_spin_trylock(session, &log->log_slot_lock) != 0)
- return (EAGAIN);
- locked = 1;
-
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(&tmp, WT_SLOT_SYNC);
- WT_ERR(__log_acquire(session, record->size, &tmp));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__log_fill(session, &myslot, 1, record, lsnp));
- WT_ERR(__log_release(session, &tmp, &dummy));
-
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
- return (ret);
+ memset(&myslot, 0, sizeof(myslot));
+ myslot.slot = log->active_slot;
+ return (__wt_log_slot_switch(session, &myslot, retry, 1));
}
/*
@@ -1741,14 +1783,16 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_LOG_RECORD *logrec;
WT_LSN lsn;
WT_MYSLOT myslot;
- uint32_t rdup_len;
- int free_slot, locked;
+ int64_t release_size;
+ uint32_t force, rdup_len;
+ int free_slot;
conn = S2C(session);
log = conn->log;
- free_slot = locked = 0;
+ free_slot = 0;
WT_INIT_LSN(&lsn);
myslot.slot = NULL;
+ memset(&myslot, 0, sizeof(myslot));
/*
* Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a
* header at the beginning for us to fill in.
@@ -1778,87 +1822,67 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
WT_STAT_FAST_CONN_INCR(session, log_writes);
- if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
- ret = __log_direct_write(session, record, &lsn, flags);
- if (ret == 0 && lsnp != NULL)
- *lsnp = lsn;
- /*
- * All needed syncing will be handled directly except
- * a background sync. Handle that here.
- */
- if (ret == 0) {
- if (LF_ISSET(WT_LOG_BACKGROUND))
- goto bg;
- else
- return (0);
- }
- if (ret != EAGAIN)
- WT_ERR(ret);
- /*
- * An EAGAIN return means we failed to get the try lock -
- * fall through to the consolidation code in that case.
- */
- }
-
+ __wt_log_slot_join(session, rdup_len, flags, &myslot);
+ /*
+ * If the addition of this record crosses the buffer boundary,
+ * switch in a new slot.
+ */
+ force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC);
+ ret = 0;
+ if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX ||
+ F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force)
+ ret = __wt_log_slot_switch(session, &myslot, 1, 0);
+ if (ret == 0)
+ ret = __log_fill(session, &myslot, 0, record, &lsn);
+ release_size = __wt_log_slot_release(
+ session, &myslot, (int64_t)rdup_len);
/*
- * As soon as we see contention for the log slot, disable direct
- * log writes. We get better performance by forcing writes through
- * the consolidation code. This is because individual writes flood
- * the I/O system faster than they contend on the log slot lock.
+	 * If we get an error, we still need to do proper accounting in
+ * the slot fields.
+ * XXX On error we may still need to call release and free.
*/
- F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
- if ((ret = __wt_log_slot_join(
- session, rdup_len, flags, &myslot)) == ENOMEM) {
+ if (ret != 0)
+ myslot.slot->slot_error = ret;
+ WT_ASSERT(session, ret == 0);
+ if (WT_LOG_SLOT_DONE(release_size)) {
+ WT_ERR(__wt_log_release(session, myslot.slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, myslot.slot);
+ } else if (force) {
/*
- * If we couldn't find a consolidated slot for this record
- * write the record directly.
+ * If we are going to wait for this slot to get written,
+ * signal the wrlsn thread.
+ *
+	 * XXX I've seen times when the condition variable is NULL.
*/
- while ((ret = __log_direct_write(
- session, record, lsnp, flags)) == EAGAIN)
- ;
- WT_ERR(ret);
- return (0);
+ if (conn->log_cond != NULL) {
+ WT_ERR(__wt_cond_signal(session, conn->log_cond));
+ __wt_yield();
+ } else
+ WT_ERR(__wt_log_force_write(session, 1));
}
- WT_ERR(ret);
- if (myslot.offset == 0) {
- __wt_spin_lock(session, &log->log_slot_lock);
- locked = 1;
- WT_ERR(__wt_log_slot_close(session, myslot.slot));
- WT_ERR(__log_acquire(
- session, myslot.slot->slot_group_size, myslot.slot));
- __wt_spin_unlock(session, &log->log_slot_lock);
- locked = 0;
- WT_ERR(__wt_log_slot_notify(session, myslot.slot));
- } else
- WT_ERR(__wt_log_slot_wait(session, myslot.slot));
- WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
- if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
- WT_ERR(__log_release(session, myslot.slot, &free_slot));
- if (free_slot)
- WT_ERR(__wt_log_slot_free(session, myslot.slot));
+ if (LF_ISSET(WT_LOG_FLUSH)) {
+ /* Wait for our writes to reach the OS */
+ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 &&
+ myslot.slot->slot_error == 0)
+ (void)__wt_cond_wait(
+ session, log->log_write_cond, 10000);
} else if (LF_ISSET(WT_LOG_FSYNC)) {
/* Wait for our writes to reach disk */
- while (WT_LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
+ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 &&
myslot.slot->slot_error == 0)
(void)__wt_cond_wait(
session, log->log_sync_cond, 10000);
- } else if (LF_ISSET(WT_LOG_FLUSH)) {
- /* Wait for our writes to reach the OS */
- while (WT_LOG_CMP(&log->write_lsn, &lsn) <= 0 &&
- myslot.slot->slot_error == 0)
- (void)__wt_cond_wait(
- session, log->log_write_cond, 10000);
}
/*
* Advance the background sync LSN if needed.
*/
-bg: if (LF_ISSET(WT_LOG_BACKGROUND) &&
- WT_LOG_CMP(&session->bg_sync_lsn, &lsn) <= 0)
+ if (LF_ISSET(WT_LOG_BACKGROUND) &&
+ __wt_log_cmp(&session->bg_sync_lsn, &lsn) <= 0)
WT_ERR(__wt_log_background(session, &lsn));
-err: if (locked)
- __wt_spin_unlock(session, &log->log_slot_lock);
+err:
if (ret == 0 && lsnp != NULL)
*lsnp = lsn;
/*
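
The rewritten write path above — join the active slot, switch if the buffer is
exceeded, fill, then release — is the heart of the change. The consolidation
idea can be shown in a self-contained toy (a sketch with invented names, not
WiredTiger code): threads reserve disjoint buffer ranges with one atomic add,
copy with no lock held, and the last thread to release flushes. It works here
only because every payload has the same, known length; the real code closes
the slot to fix the total.

	/* Build: cc -std=c11 -pthread demo.c */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define NTHREADS	4
	#define PAYLOAD		"record "

	static char buf[4096];
	static _Atomic uint32_t joined;		/* bytes reserved */
	static _Atomic uint32_t released;	/* bytes copied */

	static void *
	writer(void *arg)
	{
		uint32_t len, off;

		(void)arg;
		len = (uint32_t)strlen(PAYLOAD);

		/* Join: atomically reserve our range of the buffer. */
		off = atomic_fetch_add(&joined, len);

		/* Fill: threads copy into disjoint ranges, no lock needed. */
		memcpy(buf + off, PAYLOAD, len);

		/* Release: the last thread to finish flushes the buffer. */
		if (atomic_fetch_add(&released, len) + len == NTHREADS * len)
			printf("flush %u bytes: %.*s\n",
			    (unsigned)(NTHREADS * len),
			    (int)(NTHREADS * len), buf);
		return (NULL);
	}

	int
	main(void)
	{
		pthread_t tid[NTHREADS];
		int i;

		for (i = 0; i < NTHREADS; i++)
			(void)pthread_create(&tid[i], NULL, writer, NULL);
		for (i = 0; i < NTHREADS; i++)
			(void)pthread_join(tid[i], NULL);
		return (0);
	}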
diff --git a/src/log/log_slot.c b/src/log/log_slot.c
index 0b580af4526..216a594ce3d 100644
--- a/src/log/log_slot.c
+++ b/src/log/log_slot.c
@@ -9,325 +9,486 @@
#include "wt_internal.h"
/*
- * This file implements the consolidated array algorithm as described in
- * the paper:
- * Scalability of write-ahead logging on multicore and multisocket hardware
- * by Ryan Johnson, Ippokratis Pandis, Radu Stoica, Manos Athanassoulis
- * and Anastasia Ailamaki.
- *
- * It appeared in The VLDB Journal, DOI 10.1007/s00778-011-0260-8 and can
- * be found at:
- * http://infoscience.epfl.ch/record/170505/files/aether-smpfulltext.pdf
+ * __wt_log_slot_activate --
+ * Initialize a slot to become active.
*/
-
-/*
- * __wt_log_slot_init --
- * Initialize the slot array.
- */
-int
-__wt_log_slot_init(WT_SESSION_IMPL *session)
+void
+__wt_log_slot_activate(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *slot;
- int32_t i;
conn = S2C(session);
log = conn->log;
- for (i = 0; i < WT_SLOT_POOL; i++) {
- log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
- log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX;
- }
- /*
- * Set up the available slots from the pool the first time.
- */
- for (i = 0; i < WT_SLOT_ACTIVE; i++) {
- slot = &log->slot_pool[i];
- slot->slot_index = (uint32_t)i;
- slot->slot_state = WT_LOG_SLOT_READY;
- log->slot_array[i] = slot;
- }
-
- /*
- * Allocate memory for buffers now that the arrays are setup. Split
- * this out to make error handling simpler.
- *
- * Cap the slot buffer to the log file size.
- */
- log->slot_buf_size =
- WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE);
- for (i = 0; i < WT_SLOT_POOL; i++) {
- WT_ERR(__wt_buf_init(session,
- &log->slot_pool[i].slot_buf, log->slot_buf_size));
- F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
- }
- WT_STAT_FAST_CONN_INCRV(session,
- log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
- if (0) {
-err: while (--i >= 0)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
- }
- return (ret);
+ slot->slot_state = 0;
+ slot->slot_start_lsn = slot->slot_end_lsn = log->alloc_lsn;
+ slot->slot_start_offset = log->alloc_lsn.offset;
+ slot->slot_last_offset = log->alloc_lsn.offset;
+ slot->slot_fh = log->log_fh;
+ slot->slot_error = 0;
+ slot->slot_unbuffered = 0;
}
/*
- * __wt_log_slot_destroy --
- * Clean up the slot array on shutdown.
+ * __wt_log_slot_close --
+ * Close out the slot the caller is using. The slot may already be
+ * closed or freed by another thread.
*/
int
-__wt_log_slot_destroy(WT_SESSION_IMPL *session)
+__wt_log_slot_close(
+ WT_SESSION_IMPL *session, WT_LOGSLOT *slot, int *releasep, int forced)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- int i;
+ int64_t end_offset, new_state, old_state;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
-
- for (i = 0; i < WT_SLOT_POOL; i++)
- __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ if (releasep != NULL)
+ *releasep = 0;
+ if (slot == NULL)
+ return (WT_NOTFOUND);
+retry:
+ old_state = slot->slot_state;
+ /*
+ * If this close is coming from a forced close and a thread is in
+ * the middle of using the slot, return EBUSY. The caller can
+ * decide if retrying is necessary or not.
+ */
+ if (forced && WT_LOG_SLOT_INPROGRESS(old_state))
+ return (EBUSY);
+ /*
+	 * If someone else is switching out this slot we lost the race;
+	 * there is nothing to do but return. Return WT_NOTFOUND anytime
+	 * the given slot was
+ * processed by another closing thread. Only return 0 when we
+ * actually closed the slot.
+ */
+ if (WT_LOG_SLOT_CLOSED(old_state))
+ return (WT_NOTFOUND);
+ /*
+ * If someone completely processed this slot, we're done.
+ */
+ if (FLD64_ISSET((uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED))
+ return (WT_NOTFOUND);
+ new_state = (old_state | WT_LOG_SLOT_CLOSE);
+ /*
+ * Close this slot. If we lose the race retry.
+ */
+ if (!__wt_atomic_casiv64(&slot->slot_state, old_state, new_state))
+ goto retry;
+ /*
+ * We own the slot now. No one else can join.
+ * Set the end LSN.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
+ if (WT_LOG_SLOT_DONE(new_state) && releasep != NULL)
+ *releasep = 1;
+ slot->slot_end_lsn = slot->slot_start_lsn;
+ end_offset =
+ WT_LOG_SLOT_JOINED_BUFFERED(old_state) + slot->slot_unbuffered;
+ slot->slot_end_lsn.offset += (wt_off_t)end_offset;
+ WT_STAT_FAST_CONN_INCRV(session,
+ log_slot_consolidated, end_offset);
+ /*
+ * XXX Would like to change so one piece of code advances the LSN.
+ */
+ log->alloc_lsn = slot->slot_end_lsn;
+ WT_ASSERT(session, log->alloc_lsn.file >= log->write_lsn.file);
return (0);
}
/*
- * __wt_log_slot_join --
- * Join a consolidated logging slot. Callers should be prepared to deal
- * with a ENOMEM return - which indicates no slots could accommodate
- * the log record.
+ * __log_slot_switch_internal --
+ * Switch out the current slot and set up a new one.
*/
-int
-__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
- uint32_t flags, WT_MYSLOT *myslotp)
+static int
+__log_slot_switch_internal(
+ WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int forced)
{
- WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
WT_LOGSLOT *slot;
- int64_t new_state, old_state;
- uint32_t allocated_slot, slot_attempts;
+ int free_slot, release;
- conn = S2C(session);
- log = conn->log;
- slot_attempts = 0;
+ log = S2C(session)->log;
+ release = 0;
+ slot = myslot->slot;
+
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
- if (mysize >= (uint64_t)log->slot_buf_size) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- return (ENOMEM);
- }
-find_slot:
-#if WT_SLOT_ACTIVE == 1
- allocated_slot = 0;
-#else
- allocated_slot = __wt_random(&session->rnd) % WT_SLOT_ACTIVE;
-#endif
- /*
- * Get the selected slot. Use a barrier to prevent the compiler from
- * caching this read.
- */
- WT_BARRIER();
- slot = log->slot_array[allocated_slot];
-join_slot:
- /*
- * Read the current slot state. Use a barrier to prevent the compiler
- * from caching this read.
- */
- WT_BARRIER();
- old_state = slot->slot_state;
- /*
- * WT_LOG_SLOT_READY and higher means the slot is available for
- * joining. Any other state means it is in use and transitioning
- * from the active array.
- */
- if (old_state < WT_LOG_SLOT_READY) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_transitions);
- goto find_slot;
- }
/*
- * Add in our size to the state and then atomically swap that
- * into place if it is still the same value.
+ * If someone else raced us to closing this specific slot, we're
+ * done here.
*/
- new_state = old_state + (int64_t)mysize;
- if (new_state < old_state) {
- /* Our size doesn't fit here. */
- WT_STAT_FAST_CONN_INCR(session, log_slot_toobig);
- goto find_slot;
- }
+ if (slot != log->active_slot)
+ return (0);
+
/*
- * If the slot buffer isn't big enough to hold this update, try
- * to find another slot.
+ * If close returns WT_NOTFOUND, it means that someone else is
+	 * processing the slot change. However, we could be retrying
+	 * after an earlier busy return while creating a new slot. If
+	 * so, we are that someone else and we need to try setting up a
+	 * new slot again.
*/
- if (new_state > (int64_t)slot->slot_buf.memsize) {
- if (++slot_attempts > 5) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_toosmall);
- return (ENOMEM);
+ if (!F_ISSET(myslot, WT_MYSLOT_CLOSE)) {
+ ret = __wt_log_slot_close(
+ session, slot, &release, forced);
+ if (ret == WT_NOTFOUND)
+ return (0);
+ WT_RET(ret);
+ if (release) {
+ WT_RET(__wt_log_release(session, slot, &free_slot));
+ if (free_slot)
+ __wt_log_slot_free(session, slot);
}
- goto find_slot;
}
/*
- * We lost a race to add our size into this slot. Check the state
- * and try again.
+	 * Record that we have closed this slot: we may be called here
+	 * multiple times if creating a new slot has to be retried.
*/
- if (!WT_ATOMIC_CAS8(slot->slot_state, old_state, new_state)) {
- WT_STAT_FAST_CONN_INCR(session, log_slot_races);
- goto join_slot;
- }
- WT_ASSERT(session, myslotp != NULL);
+ F_SET(myslot, WT_MYSLOT_CLOSE);
+ WT_RET(__wt_log_slot_new(session));
+ F_CLR(myslot, WT_MYSLOT_CLOSE);
+ return (0);
+}
+
+/*
+ * __wt_log_slot_switch --
+ * Switch out the current slot and set up a new one.
+ */
+int
+__wt_log_slot_switch(
+ WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int retry, int forced)
+{
+ WT_DECL_RET;
+ WT_LOG *log;
+
+ log = S2C(session)->log;
/*
- * We joined this slot. Fill in our information to return to
- * the caller.
+ * !!! Since the WT_WITH_SLOT_LOCK macro is a do-while loop, the
+ * compiler does not like it combined directly with the while loop
+ * here.
*/
- WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
- if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC_DIR);
- if (LF_ISSET(WT_LOG_FSYNC))
- F_SET(slot, WT_SLOT_SYNC);
- myslotp->slot = slot;
- myslotp->offset = (wt_off_t)old_state - WT_LOG_SLOT_READY;
- return (0);
+ do {
+ WT_WITH_SLOT_LOCK(session, log,
+ ret = __log_slot_switch_internal(
+ session, myslot, forced));
+ if (ret == EBUSY) {
+ WT_STAT_FAST_CONN_INCR(session, log_slot_switch_busy);
+ __wt_yield();
+ }
+ } while (F_ISSET(myslot, WT_MYSLOT_CLOSE) || (retry && ret == EBUSY));
+ return (ret);
}
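
The wrapper above encodes a small, reusable pattern: take the lock, attempt
the operation, and on EBUSY drop the lock, yield, and retry. Stripped of the
WiredTiger specifics it looks like this (try_op and op_lock are stand-ins for
the example, not real interfaces):

	#include <errno.h>
	#include <pthread.h>
	#include <sched.h>

	static pthread_mutex_t op_lock = PTHREAD_MUTEX_INITIALIZER;

	extern int try_op(void);	/* returns 0 on success or EBUSY */

	/* Take the lock, try the operation, retry on EBUSY if asked to. */
	static int
	op_with_retry(int retry)
	{
		int ret;

		do {
			pthread_mutex_lock(&op_lock);
			ret = try_op();
			pthread_mutex_unlock(&op_lock);
			if (ret == EBUSY)
				sched_yield();	/* let the busy thread make progress */
		} while (retry && ret == EBUSY);
		return (ret);
	}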
/*
- * __log_slot_find_free --
- * Find and return a free log slot.
+ * __wt_log_slot_new --
+ * Find a free slot and switch it as the new active slot.
+ * Must be called holding the slot lock.
*/
-static int
-__log_slot_find_free(WT_SESSION_IMPL *session, WT_LOGSLOT **slot)
+int
+__wt_log_slot_new(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
WT_LOG *log;
- uint32_t pool_i;
+ WT_LOGSLOT *slot;
+ int32_t i;
+ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT));
conn = S2C(session);
log = conn->log;
- WT_ASSERT(session, slot != NULL);
/*
- * Encourage processing and moving the write LSN forward.
- * That process has to walk the slots anyway, so do that
- * work and let it give us the index of a free slot along
- * the way.
+	 * Although this function runs single threaded (the slot lock is
+	 * held), multiple threads could be trying to set a new active
+	 * slot sequentially. If we find an
+ * active slot that is valid, return.
*/
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
- while (pool_i == WT_SLOT_POOL) {
+ if ((slot = log->active_slot) != NULL &&
+ WT_LOG_SLOT_OPEN(slot->slot_state))
+ return (0);
+
+ /*
+ * Keep trying until we can find a free slot.
+ */
+ for (;;) {
+ /*
+			 * For now just restart the search at 0. We could
+			 * resume from log->pool_index if restarting proves
+			 * inefficient.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (slot->slot_state == WT_LOG_SLOT_FREE) {
+ /*
+ * Make sure that the next buffer size can
+ * fit in the file. Proactively switch if
+ * it cannot. This reduces, but does not
+ * eliminate, log files that exceed the
+ * maximum file size.
+ *
+ * We want to minimize the risk of an
+ * error due to no space.
+ */
+ WT_RET(__wt_log_acquire(session,
+ log->slot_buf_size, slot));
+ /*
+ * We have a new, free slot to use.
+ * Set it as the active slot.
+ */
+ WT_STAT_FAST_CONN_INCR(session,
+ log_slot_transitions);
+ log->active_slot = slot;
+ return (0);
+ }
+ }
+ /*
+ * If we didn't find any free slots signal the worker thread.
+ */
+ (void)__wt_cond_signal(session, conn->log_wrlsn_cond);
__wt_yield();
- WT_RET(__wt_log_wrlsn(session, &pool_i, NULL));
}
- *slot = &log->slot_pool[pool_i];
- WT_ASSERT(session, (*slot)->slot_state == WT_LOG_SLOT_FREE);
- return (0);
+ /* NOTREACHED */
}
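
The forever-loop above scans a fixed pool for a free slot and, when none is
available, wakes the helper thread and yields before rescanning. A standalone
sketch of that shape follows; note the real function runs under the slot lock
and can test slot_state directly, while this lock-free version needs a CAS
(pool, wake_helper and the states are invented for the example):

	#include <sched.h>
	#include <stdatomic.h>

	#define POOL_SIZE	128
	#define STATE_FREE	0
	#define STATE_ACTIVE	1

	static _Atomic int pool[POOL_SIZE];

	extern void wake_helper(void);	/* stand-in for the wrlsn signal */

	/* Claim a free pool entry, waking the helper when none are free. */
	static int
	pool_claim(void)
	{
		int expected, i;

		for (;;) {
			for (i = 0; i < POOL_SIZE; i++) {
				expected = STATE_FREE;
				if (atomic_compare_exchange_strong(
				    &pool[i], &expected, STATE_ACTIVE))
					return (i);
			}
			wake_helper();	/* the helper frees entries */
			sched_yield();
		}
		/* NOTREACHED */
	}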
/*
- * __wt_log_slot_close --
- * Close a slot and do not allow any other threads to join this slot.
- * Remove this from the active slot array and move a new slot from
- * the pool into its place. Set up the size of this group;
- * Must be called with the logging spinlock held.
+ * __wt_log_slot_init --
+ * Initialize the slot array.
*/
int
-__wt_log_slot_close(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_init(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_LOG *log;
- WT_LOGSLOT *newslot;
- int64_t old_state;
+ WT_LOGSLOT *slot;
+ int32_t i;
conn = S2C(session);
log = conn->log;
- /*
- * Find an unused slot in the pool.
- */
- WT_RET(__log_slot_find_free(session, &newslot));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, log->slot_pool);
+ for (i = 0; i < WT_SLOT_POOL; i++)
+ log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
/*
- * Swap out the slot we're going to use and put a free one in the
- * slot array in its place so that threads can use it right away.
+ * Allocate memory for buffers now that the arrays are setup. Split
+ * this out to make error handling simpler.
*/
- WT_STAT_FAST_CONN_INCR(session, log_slot_closes);
- newslot->slot_state = WT_LOG_SLOT_READY;
- newslot->slot_index = slot->slot_index;
- log->slot_array[newslot->slot_index] = newslot;
- old_state = WT_ATOMIC_STORE8(slot->slot_state, WT_LOG_SLOT_PENDING);
- slot->slot_group_size = (uint64_t)(old_state - WT_LOG_SLOT_READY);
/*
- * Note that this statistic may be much bigger than in reality,
- * especially when compared with the total bytes written in
- * __log_fill. The reason is that this size reflects any
- * rounding up that is needed and the total bytes in __log_fill
- * is the amount of user bytes.
+	 * Cap the slot buffer to a fraction of the maximum log file size
+	 * if needed; we want each slot buffer to stay small relative to
+	 * the log file.
+ *
+ * !!! If the buffer size is too close to the log file size, we will
+ * switch log files very aggressively. Scale back the buffer for
+ * small log file sizes.
*/
+ log->slot_buf_size = (uint32_t)WT_MIN(
+ (size_t)conn->log_file_max/10, WT_LOG_SLOT_BUF_SIZE);
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ WT_ERR(__wt_buf_init(session,
+ &log->slot_pool[i].slot_buf, log->slot_buf_size));
+ F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
+ }
WT_STAT_FAST_CONN_INCRV(session,
- log_slot_consolidated, (uint64_t)slot->slot_group_size);
- return (0);
+ log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
+ /*
+ * Set up the available slot from the pool the first time.
+ */
+ slot = &log->slot_pool[0];
+ /*
+ * We cannot initialize the release LSN in the activate function
+ * because that is called after a log file switch.
+ */
+ slot->slot_release_lsn = log->alloc_lsn;
+ __wt_log_slot_activate(session, slot);
+ log->active_slot = slot;
+
+ if (0) {
+err: while (--i >= 0)
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
+ return (ret);
}
/*
- * __wt_log_slot_notify --
- * Notify all threads waiting for the state to be < WT_LOG_SLOT_DONE.
+ * __wt_log_slot_destroy --
+ * Clean up the slot array on shutdown.
*/
int
-__wt_log_slot_notify(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+__wt_log_slot_destroy(WT_SESSION_IMPL *session)
{
- WT_UNUSED(session);
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t rel;
+ int i;
- slot->slot_state =
- (int64_t)WT_LOG_SLOT_DONE - (int64_t)slot->slot_group_size;
+ conn = S2C(session);
+ log = conn->log;
+
+ /*
+	 * Write out any remaining buffered data, then free each buffer.
+ */
+ for (i = 0; i < WT_SLOT_POOL; i++) {
+ slot = &log->slot_pool[i];
+ if (!FLD64_ISSET(
+ (uint64_t)slot->slot_state, WT_LOG_SLOT_RESERVED)) {
+ rel = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state);
+ if (rel != 0)
+ WT_RET(__wt_write(session, slot->slot_fh,
+ slot->slot_start_offset, (size_t)rel,
+ slot->slot_buf.mem));
+ }
+ __wt_buf_free(session, &log->slot_pool[i].slot_buf);
+ }
return (0);
}
/*
- * __wt_log_slot_wait --
- * Wait for slot leader to allocate log area and tell us our log offset.
+ * __wt_log_slot_join --
+ * Join a consolidated logging slot. Must be called with
+ * the read lock held.
*/
-int
-__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
+void
+__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
+ uint32_t flags, WT_MYSLOT *myslot)
{
- int yield_count;
+ WT_CONNECTION_IMPL *conn;
+ WT_LOG *log;
+ WT_LOGSLOT *slot;
+ int64_t flag_state, new_state, old_state, released;
+ int32_t join_offset, new_join;
+#ifdef HAVE_DIAGNOSTIC
+ int unbuf_force;
+#endif
- yield_count = 0;
- WT_UNUSED(session);
+ conn = S2C(session);
+ log = conn->log;
- while (slot->slot_state > WT_LOG_SLOT_DONE)
- if (++yield_count < 1000)
- __wt_yield();
- else
- __wt_sleep(0, 200);
- return (0);
+ /*
+	 * Make sure the length cannot overflow. The caller should not
+	 * even call this function if the record doesn't fit; it should
+	 * use direct writes instead.
+ */
+ WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
+
+ /*
+ * There should almost always be a slot open.
+ */
+#ifdef HAVE_DIAGNOSTIC
+ unbuf_force = ((++log->write_calls % 1000) == 0);
+#endif
+ for (;;) {
+ WT_BARRIER();
+ slot = log->active_slot;
+ old_state = slot->slot_state;
+ /*
+ * Try to join our size into the existing size and
+ * atomically write it back into the state.
+ */
+ flag_state = WT_LOG_SLOT_FLAGS(old_state);
+ released = WT_LOG_SLOT_RELEASED(old_state);
+ join_offset = WT_LOG_SLOT_JOINED(old_state);
+#ifdef HAVE_DIAGNOSTIC
+ if (unbuf_force || mysize > WT_LOG_SLOT_BUF_MAX) {
+#else
+ if (mysize > WT_LOG_SLOT_BUF_MAX) {
+#endif
+ new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
+ F_SET(myslot, WT_MYSLOT_UNBUFFERED);
+ myslot->slot = slot;
+ } else
+ new_join = join_offset + (int32_t)mysize;
+ new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
+ (int64_t)new_join, (int64_t)released, (int64_t)flag_state);
+
+ /*
+ * Check if the slot is open for joining and we are able to
+ * swap in our size into the state.
+ */
+ if (WT_LOG_SLOT_OPEN(old_state) &&
+ __wt_atomic_casiv64(
+ &slot->slot_state, old_state, new_state))
+ break;
+ /*
+ * The slot is no longer open or we lost the race to
+ * update it. Yield and try again.
+ */
+ WT_STAT_FAST_CONN_INCR(session, log_slot_races);
+ __wt_yield();
+ }
+ /*
+ * We joined this slot. Fill in our information to return to
+ * the caller.
+ */
+ if (mysize != 0)
+ WT_STAT_FAST_CONN_INCR(session, log_slot_joins);
+ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC_DIR);
+ if (LF_ISSET(WT_LOG_FSYNC))
+ F_SET(slot, WT_SLOT_SYNC);
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
+ WT_ASSERT(session, slot->slot_unbuffered == 0);
+ WT_STAT_FAST_CONN_INCR(session, log_slot_unbuffered);
+ slot->slot_unbuffered = (int64_t)mysize;
+ }
+ myslot->slot = slot;
+ myslot->offset = join_offset;
+ myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}
/*
* __wt_log_slot_release --
* Each thread in a consolidated group releases its portion to
- * signal it has completed writing its piece of the log.
+ * signal it has completed copying its piece of the log into
+ * the memory buffer.
*/
int64_t
-__wt_log_slot_release(WT_LOGSLOT *slot, uint64_t size)
+__wt_log_slot_release(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, int64_t size)
{
- int64_t newsize;
+ WT_LOGSLOT *slot;
+ wt_off_t cur_offset, my_start;
+ int64_t my_size, rel_size;
+ WT_UNUSED(session);
+ slot = myslot->slot;
+ my_start = slot->slot_start_offset + myslot->offset;
+ while ((cur_offset = slot->slot_last_offset) < my_start) {
+ /*
+ * Set our offset if we are larger.
+ */
+ if (__wt_atomic_casiv64(
+ &slot->slot_last_offset, cur_offset, my_start))
+ break;
+ /*
+ * If we raced another thread updating this, try again.
+ */
+ WT_BARRIER();
+ }
/*
- * Add my size into the state. When it reaches WT_LOG_SLOT_DONE
- * all participatory threads have completed copying their piece.
+ * Add my size into the state and return the new size.
*/
- newsize = WT_ATOMIC_ADD8(slot->slot_state, (int64_t)size);
- return (newsize);
+ rel_size = size;
+ if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
+ rel_size = WT_LOG_SLOT_UNBUFFERED;
+ my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
+ return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
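
Join and release both update the single slot_state word, which is what lets
__wt_log_slot_join run without the slot lock. A simplified model of that
packing — joined count in the high 32 bits, released count in the low 32 — is
sketched below; the real WT_LOG_SLOT_JOIN_REL encoding also reserves bits for
flags such as WT_LOG_SLOT_CLOSE, so treat the layout here as illustrative
only:

	#include <stdatomic.h>
	#include <stdint.h>

	/* Illustrative layout: joined count high 32 bits, released low 32. */
	#define SLOT_JOINED(state)	((int32_t)((uint64_t)(state) >> 32))
	#define SLOT_RELEASED(state)	((int32_t)((uint64_t)(state) & 0xffffffffu))
	#define SLOT_JOIN(size)		((int64_t)(size) << 32)

	static _Atomic int64_t slot_state;

	/* Reserve size bytes; the old joined count is our buffer offset. */
	static int32_t
	slot_join(int32_t size)
	{
		int64_t new_state, old_state;

		old_state = atomic_load(&slot_state);
		do {
			new_state = old_state + SLOT_JOIN(size);
		} while (!atomic_compare_exchange_weak(
		    &slot_state, &old_state, new_state));
		return (SLOT_JOINED(old_state));
	}

	/*
	 * Signal that our copy is complete; when the released count catches
	 * up with the joined count (after the slot is closed), the slot's
	 * buffer can be written out.
	 */
	static int64_t
	slot_release(int32_t size)
	{
		return (atomic_fetch_add(&slot_state, (int64_t)size) + size);
	}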
/*
* __wt_log_slot_free --
* Free a slot back into the pool.
*/
-int
+void
__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
- WT_UNUSED(session);
/*
* Make sure flags don't get retained between uses.
 * We have to reset them here because multiple threads may
* change the flags when joining the slot.
*/
+ WT_UNUSED(session);
slot->flags = WT_SLOT_INIT_FLAGS;
+ slot->slot_error = 0;
slot->slot_state = WT_LOG_SLOT_FREE;
- return (0);
}
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 84b8d5c9532..6068bb6c559 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -134,7 +134,7 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
if (have_primary) {
WT_ENTER_PAGE_INDEX(session);
WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
- ovfl = __wt_btree_lsm_size(session, hard_limit ?
+ ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
WT_LEAVE_PAGE_INDEX(session);
@@ -1066,12 +1066,12 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
ret = __wt_bloom_hash_get(bloom, &bhash);
if (ret == WT_NOTFOUND) {
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_miss);
+ WT_LSM_TREE_STAT_INCR(
+ session, clsm->lsm_tree->bloom_miss);
continue;
} else if (ret == 0)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_hit);
+ WT_LSM_TREE_STAT_INCR(
+ session, clsm->lsm_tree->bloom_hit);
WT_ERR(ret);
}
c->set_key(c, &cursor->key);
@@ -1086,11 +1086,11 @@ __clsm_lookup(WT_CURSOR_LSM *clsm, WT_ITEM *value)
F_CLR(c, WT_CURSTD_KEY_SET);
/* Update stats: the active chunk can't have a bloom filter. */
if (bloom != NULL)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, bloom_false_positive);
+ WT_LSM_TREE_STAT_INCR(session,
+ clsm->lsm_tree->bloom_false_positive);
else if (clsm->primary_chunk == NULL || i != clsm->nchunks)
- WT_STAT_FAST_INCR(session,
- &clsm->lsm_tree->stats, lsm_lookup_no_bloom);
+ WT_LSM_TREE_STAT_INCR(session,
+ clsm->lsm_tree->lsm_lookup_no_bloom);
}
WT_ERR(WT_NOTFOUND);
@@ -1331,12 +1331,12 @@ __clsm_put(WT_SESSION_IMPL *session,
++clsm->update_count >= 100) &&
lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
clsm->update_count = 0;
- WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
- lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
+ WT_LSM_TREE_STAT_INCRV(session,
+ lsm_tree->lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
WT_STAT_FAST_CONN_INCRV(session,
lsm_checkpoint_throttle, lsm_tree->ckpt_throttle);
- WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
- lsm_merge_throttle, lsm_tree->merge_throttle);
+ WT_LSM_TREE_STAT_INCRV(session,
+ lsm_tree->lsm_merge_throttle, lsm_tree->merge_throttle);
WT_STAT_FAST_CONN_INCRV(session,
lsm_merge_throttle, lsm_tree->merge_throttle);
__wt_sleep(0,
diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c
index cb078d991d8..6c59232b619 100644
--- a/src/lsm/lsm_manager.c
+++ b/src/lsm/lsm_manager.c
@@ -258,7 +258,7 @@ __wt_lsm_manager_free_work_unit(
if (entry != NULL) {
WT_ASSERT(session, entry->lsm_tree->queue_ref > 0);
- (void)WT_ATOMIC_SUB4(entry->lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&entry->lsm_tree->queue_ref, 1);
__wt_free(session, entry);
}
}
@@ -273,7 +273,7 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_LSM_MANAGER *manager;
- WT_LSM_WORK_UNIT *current, *next;
+ WT_LSM_WORK_UNIT *current;
WT_SESSION *wt_session;
uint32_t i;
uint64_t removed;
@@ -297,23 +297,17 @@ __wt_lsm_manager_destroy(WT_SESSION_IMPL *session)
manager->lsm_worker_cookies[0].tid = 0;
/* Release memory from any operations left on the queue. */
- for (current = TAILQ_FIRST(&manager->switchqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->switchqh)) != NULL) {
TAILQ_REMOVE(&manager->switchqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
}
- for (current = TAILQ_FIRST(&manager->appqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->appqh)) != NULL) {
TAILQ_REMOVE(&manager->appqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
}
- for (current = TAILQ_FIRST(&manager->managerqh);
- current != NULL; current = next) {
- next = TAILQ_NEXT(current, q);
+ while ((current = TAILQ_FIRST(&manager->managerqh)) != NULL) {
TAILQ_REMOVE(&manager->managerqh, current, q);
++removed;
__wt_lsm_manager_free_work_unit(session, current);
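
The loops above are simplified from a for-loop that pre-fetched the next
element into the while-TAILQ_FIRST idiom, which is safe precisely because each
iteration removes the head. A minimal standalone version (struct and queue
names invented for the example):

	#include <stdlib.h>
	#include <sys/queue.h>

	struct unit {
		TAILQ_ENTRY(unit) q;
	};
	TAILQ_HEAD(unitq, unit);

	/* Drain a queue by always removing the head until it is empty. */
	static void
	drain(struct unitq *qh)
	{
		struct unit *current;

		while ((current = TAILQ_FIRST(qh)) != NULL) {
			TAILQ_REMOVE(qh, current, q);
			free(current);
		}
	}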
@@ -645,9 +639,9 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
* on close, the flag is cleared and then the queue reference count
* is checked.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_add32(&lsm_tree->queue_ref, 1);
if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
- (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
return (0);
}
@@ -674,6 +668,6 @@ __wt_lsm_manager_push_entry(WT_SESSION_IMPL *session,
return (0);
err:
if (!pushed)
- (void)WT_ATOMIC_SUB4(lsm_tree->queue_ref, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->queue_ref, 1);
return (ret);
}
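
The WT_ATOMIC_ADD4/WT_ATOMIC_SUB4 macros give way to typed functions such as
__wt_atomic_add32 that take a pointer to the value. The get-then-check pattern
used by __wt_lsm_manager_push_entry — take a reference, re-check the ACTIVE
flag, and drop the reference if the tree is shutting down — can be modelled
with C11 atomics (a sketch, not the WiredTiger implementation):

	#include <stdatomic.h>
	#include <stdint.h>

	struct tree {
		_Atomic uint32_t queue_ref;	/* queued work referencing us */
		int active;			/* cleared on shutdown */
	};

	/*
	 * Take a queue reference, then re-check the active flag: if the
	 * tree is shutting down, give the reference back and tell the
	 * caller not to queue the work.
	 */
	static int
	tree_queue_get(struct tree *t)
	{
		atomic_fetch_add(&t->queue_ref, 1);
		if (!t->active) {
			atomic_fetch_sub(&t->queue_ref, 1);
			return (0);
		}
		return (1);
	}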
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index d7e684b8f51..01a61359949 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -398,7 +398,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
locked = 0;
/* Allocate an ID for the merge. */
- dest_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ dest_id = __wt_atomic_add32(&lsm_tree->last, 1);
/*
* We only want to do the chunk loop if we're running with verbose,
@@ -493,7 +493,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* merge_syncing field so that compact knows it is still in
* progress.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
in_sync = 1;
/*
* We've successfully created the new chunk. Now install it. We need
@@ -512,7 +512,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
* Don't block if the cache is full: our next unit of work may be to
* discard some trees to free space.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_EVICTION);
if (create_bloom) {
if (ret == 0)
@@ -544,7 +544,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
WT_TRET(dest->close(dest));
dest = NULL;
++lsm_tree->merge_progressing;
- (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
in_sync = 0;
WT_ERR_NOTFOUND_OK(ret);
@@ -600,7 +600,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
err: if (locked)
WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
if (in_sync)
- (void)WT_ATOMIC_SUB4(lsm_tree->merge_syncing, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
if (src != NULL)
WT_TRET(src->close(src));
if (dest != NULL)
@@ -632,6 +632,6 @@ err: if (locked)
"Merge failed with %s",
__wt_strerror(session, ret, NULL, 0)));
}
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
diff --git a/src/lsm/lsm_stat.c b/src/lsm/lsm_stat.c
index 126a59af0d1..2817ec9eeb7 100644
--- a/src/lsm/lsm_stat.c
+++ b/src/lsm/lsm_stat.c
@@ -22,6 +22,7 @@ __curstat_lsm_init(
WT_DSRC_STATS *new, *stats;
WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ int64_t bloom_count;
u_int i;
int locked;
char config[64];
@@ -49,25 +50,22 @@ __curstat_lsm_init(
cfg[1] = disk_cfg[1] = config;
}
- /*
- * Set the cursor to reference the data source statistics; we don't
- * initialize it, instead we copy (rather than aggregate), the first
- * chunk's statistics, which has the same effect.
- */
- stats = &cst->u.dsrc_stats;
-
/* Hold the LSM lock so that we can safely walk through the chunks. */
WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
locked = 1;
- /* Initialize the statistics. */
- __wt_stat_init_dsrc_stats(stats);
+ /*
+ * Set the cursor to reference the data source statistics into which
+ * we're going to aggregate statistics from the underlying objects.
+ */
+ stats = &cst->u.dsrc_stats;
+ __wt_stat_dsrc_init_single(stats);
/*
* For each chunk, aggregate its statistics, as well as any associated
* bloom filter statistics, into the total statistics.
*/
- for (i = 0; i < lsm_tree->nchunks; i++) {
+ for (bloom_count = 0, i = 0; i < lsm_tree->nchunks; i++) {
chunk = lsm_tree->chunk[i];
/*
@@ -93,17 +91,17 @@ __curstat_lsm_init(
* top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- WT_STAT_SET(new, lsm_generation_max, chunk->generation);
+ new->lsm_generation_max = chunk->generation;
/* Aggregate statistics from each new chunk. */
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
continue;
/* Maintain a count of bloom filters. */
- WT_STAT_INCR(&lsm_tree->stats, bloom_count);
+ ++bloom_count;
/* Get the bloom filter's underlying object. */
WT_ERR(__wt_buf_fmt(
@@ -117,24 +115,39 @@ __curstat_lsm_init(
* into the top-level.
*/
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- WT_STAT_SET(new,
- bloom_size, (chunk->count * lsm_tree->bloom_bit_count) / 8);
- WT_STAT_SET(new, bloom_page_evict,
- WT_STAT(new, cache_eviction_clean) +
- WT_STAT(new, cache_eviction_dirty));
- WT_STAT_SET(new, bloom_page_read, WT_STAT(new, cache_read));
-
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ new->bloom_size =
+ (int64_t)((chunk->count * lsm_tree->bloom_bit_count) / 8);
+ new->bloom_page_evict =
+ new->cache_eviction_clean + new->cache_eviction_dirty;
+ new->bloom_page_read = new->cache_read;
+
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
/* Set statistics that aren't aggregated directly into the cursor */
- WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
+ stats->bloom_count = bloom_count;
+ stats->lsm_chunk_count = lsm_tree->nchunks;
- /* Aggregate, and optionally clear, LSM-level specific information. */
- __wt_stat_aggregate_dsrc_stats(&lsm_tree->stats, stats);
+ /* Include, and optionally clear, LSM-level specific information. */
+ stats->bloom_miss = lsm_tree->bloom_miss;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_miss = 0;
+ stats->bloom_hit = lsm_tree->bloom_hit;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_hit = 0;
+ stats->bloom_false_positive = lsm_tree->bloom_false_positive;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->bloom_false_positive = 0;
+ stats->lsm_lookup_no_bloom = lsm_tree->lsm_lookup_no_bloom;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->lsm_lookup_no_bloom = 0;
+ stats->lsm_checkpoint_throttle = lsm_tree->lsm_checkpoint_throttle;
+ if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
+ lsm_tree->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = lsm_tree->lsm_merge_throttle;
if (F_ISSET(cst, WT_CONN_STAT_CLEAR))
- __wt_stat_refresh_dsrc_stats(&lsm_tree->stats);
+ lsm_tree->lsm_merge_throttle = 0;
__wt_curstat_dsrc_final(cst);
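
Each copy-then-conditionally-zero pair above is an instance of a clear-on-read
pattern; if the repetition grew further it could be folded into a helper macro
along these lines (a sketch — WiredTiger spells each statistic out
explicitly):

	/* Copy a counter into the snapshot, optionally zeroing the source. */
	#define STAT_COPY_MAYBE_CLEAR(dst, src, field, clear) do {	\
		(dst)->field = (src)->field;				\
		if (clear)						\
			(src)->field = 0;				\
	} while (0)

	/*
	 * Usage, mirroring the code above:
	 * STAT_COPY_MAYBE_CLEAR(stats, lsm_tree, bloom_hit,
	 *     F_ISSET(cst, WT_CONN_STAT_CLEAR));
	 */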
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 6c6b185f821..46db76e099c 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -141,7 +141,7 @@ __wt_lsm_tree_close_all(WT_SESSION_IMPL *session)
* is no need to decrement the reference count since discard
* is unconditional.
*/
- (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
WT_TRET(__lsm_tree_close(session, lsm_tree));
WT_TRET(__lsm_tree_discard(session, lsm_tree, 1));
}
@@ -486,15 +486,17 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
* Make sure we win the race to switch on the
* exclusive flag.
*/
- if (!WT_ATOMIC_CAS1(lsm_tree->exclusive, 0, 1))
+ if (!__wt_atomic_cas8(
+ &lsm_tree->exclusive, 0, 1))
return (EBUSY);
/* Make sure there are no readers */
- if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
+ if (!__wt_atomic_cas32(
+ &lsm_tree->refcnt, 0, 1)) {
lsm_tree->exclusive = 0;
return (EBUSY);
}
} else {
- (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_add32(&lsm_tree->refcnt, 1);
/*
* We got a reference, check if an exclusive
@@ -503,8 +505,8 @@ __lsm_tree_find(WT_SESSION_IMPL *session,
if (lsm_tree->exclusive) {
WT_ASSERT(session,
lsm_tree->refcnt > 0);
- (void)WT_ATOMIC_SUB4(
- lsm_tree->refcnt, 1);
+ (void)__wt_atomic_sub32(
+ &lsm_tree->refcnt, 1);
return (EBUSY);
}
}
@@ -565,7 +567,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST));
/* Start the LSM manager thread if it isn't running. */
- if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
+ if (__wt_atomic_cas32(&conn->lsm_manager.lsm_workers, 0, 1))
WT_RET(__wt_lsm_manager_start(session));
/* Make sure no one beat us to it. */
@@ -596,7 +598,7 @@ __lsm_tree_open(WT_SESSION_IMPL *session,
* with getting handles exclusive.
*/
lsm_tree->refcnt = 1;
- lsm_tree->exclusive = (int8_t)exclusive;
+ lsm_tree->exclusive = exclusive ? 1 : 0;
lsm_tree->queue_ref = 0;
/* Set a flush timestamp as a baseline. */
@@ -644,7 +646,7 @@ __wt_lsm_tree_release(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ASSERT(session, lsm_tree->refcnt > 0);
if (lsm_tree->exclusive)
lsm_tree->exclusive = 0;
- (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
+ (void)__wt_atomic_sub32(&lsm_tree->refcnt, 1);
}
/* How aggressively to ramp up or down throttle due to level 0 merging */
@@ -839,7 +841,7 @@ __wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
/* Update the throttle time. */
__wt_lsm_tree_throttle(session, lsm_tree, 0);
- new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ new_id = __wt_atomic_add32(&lsm_tree->last, 1);
WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
nchunks + 1, &lsm_tree->chunk));
@@ -1097,7 +1099,7 @@ __wt_lsm_tree_truncate(
/* Create the new chunk. */
WT_ERR(__wt_calloc_one(session, &chunk));
- chunk->id = WT_ATOMIC_ADD4(lsm_tree->last, 1);
+ chunk->id = __wt_atomic_add32(&lsm_tree->last, 1);
WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
/* Mark all chunks old. */
@@ -1142,7 +1144,7 @@ __wt_lsm_tree_readlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1155,7 +1157,7 @@ __wt_lsm_tree_readunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_readunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
@@ -1175,7 +1177,7 @@ __wt_lsm_tree_writelock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Diagnostic: avoid deadlocks with the schema lock: if we need it for
* an operation, we should already have it.
*/
- F_SET(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_SET(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
return (0);
}
@@ -1188,7 +1190,7 @@ __wt_lsm_tree_writeunlock(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
WT_DECL_RET;
- F_CLR(session, WT_SESSION_NO_CACHE_CHECK | WT_SESSION_NO_SCHEMA_LOCK);
+ F_CLR(session, WT_SESSION_NO_EVICTION | WT_SESSION_NO_SCHEMA_LOCK);
if ((ret = __wt_writeunlock(session, lsm_tree->rwlock)) != 0)
WT_PANIC_RET(session, ret, "Unlocking an LSM tree");
@@ -1207,7 +1209,8 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
WT_LSM_TREE *lsm_tree;
time_t begin, end;
uint64_t progress;
- int i, compacting, flushing, locked, ref;
+ uint32_t i;
+ int compacting, flushing, locked, ref;
compacting = flushing = locked = ref = 0;
chunk = NULL;
@@ -1282,7 +1285,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
* If we have a chunk, we want to look for it to be on-disk.
* So we need to add a reference to keep it available.
*/
- (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ (void)__wt_atomic_add32(&chunk->refcnt, 1);
ref = 1;
}
@@ -1330,7 +1333,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
"Start compacting progress %" PRIu64,
name, chunk->id,
lsm_tree->merge_progressing));
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
flushing = ref = 0;
compacting = 1;
F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
@@ -1384,7 +1387,7 @@ __wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
err:
/* Ensure anything we set is cleared. */
if (ref)
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
if (compacting) {
F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
lsm_tree->merge_aggressiveness = 0;
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index c3bee162ea1..8eba0127b8b 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -53,7 +53,7 @@ __lsm_copy_chunks(WT_SESSION_IMPL *session,
* it's safe.
*/
for (i = 0; i < nchunks; i++)
- (void)WT_ATOMIC_ADD4(cookie->chunk_array[i]->refcnt, 1);
+ (void)__wt_atomic_add32(&cookie->chunk_array[i]->refcnt, 1);
err: WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
@@ -122,7 +122,7 @@ __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
force ? " w/ force" : "",
i, lsm_tree->nchunks, chunk->uri));
- (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
+ (void)__wt_atomic_add32(&chunk->refcnt, 1);
}
err: WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));
@@ -145,7 +145,7 @@ __lsm_unpin_chunks(WT_SESSION_IMPL *session, WT_LSM_WORKER_COOKIE *cookie)
if (cookie->chunk_array[i] == NULL)
continue;
WT_ASSERT(session, cookie->chunk_array[i]->refcnt > 0);
- (void)WT_ATOMIC_SUB4(cookie->chunk_array[i]->refcnt, 1);
+ (void)__wt_atomic_sub32(&cookie->chunk_array[i]->refcnt, 1);
}
/* Ensure subsequent calls don't double decrement. */
cookie->nchunks = 0;
@@ -223,7 +223,7 @@ __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* See if we win the race to switch on the "busy" flag and
* recheck that the chunk still needs a Bloom filter.
*/
- if (WT_ATOMIC_CAS4(chunk->bloom_busy, 0, 1)) {
+ if (__wt_atomic_cas32(&chunk->bloom_busy, 0, 1)) {
if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
ret = __lsm_bloom_create(
session, lsm_tree, chunk, (u_int)i);
@@ -301,17 +301,19 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
* Flush the file before checkpointing: this is the expensive part in
* terms of I/O.
*
- * Use the special eviction isolation level to avoid interfering with
- * an application checkpoint: we have already checked that all of the
- * updates in this chunk are globally visible.
- *
- * !!! We can wait here for checkpoints and fsyncs to complete, which
- * can be a long time.
+ * !!!
+ * We can wait here for checkpoints and fsyncs to complete, which can
+ * take a long time.
*/
if ((ret = __wt_session_get_btree(
session, chunk->uri, NULL, NULL, 0)) == 0) {
+ /*
+ * Set read-uncommitted: we have already checked that all of the
+		 * updates in this chunk are globally visible; use the cheapest
+ * possible check in reconciliation.
+ */
saved_isolation = session->txn.isolation;
- session->txn.isolation = WT_ISO_EVICTION;
+ session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
session->txn.isolation = saved_isolation;
WT_TRET(__wt_session_release_btree(session));
@@ -412,7 +414,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
* ourselves to get stuck creating bloom filters, the entire tree
* can stall since there may be no worker threads available to flush.
*/
- F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
WT_ERR(src->get_key(src, &key));
WT_ERR(__wt_bloom_insert(bloom, &key));
@@ -446,7 +448,7 @@ __lsm_bloom_create(WT_SESSION_IMPL *session,
err: if (bloom != NULL)
WT_TRET(__wt_bloom_close(bloom));
- F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_CACHE_CHECK);
+ F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
return (ret);
}
@@ -528,7 +530,7 @@ __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
* Make sure only a single thread is freeing the old chunk array
* at any time.
*/
- if (!WT_ATOMIC_CAS4(lsm_tree->freeing_old_chunks, 0, 1))
+ if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
return (0);
/*
* Take a copy of the current state of the LSM tree and look for chunks
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index 8ed4a117641..3add3155e17 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -65,7 +65,7 @@ __lsm_worker_general_op(
ret = __wt_lsm_checkpoint_chunk(
session, entry->lsm_tree, chunk);
WT_ASSERT(session, chunk->refcnt > 0);
- (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
+ (void)__wt_atomic_sub32(&chunk->refcnt, 1);
WT_ERR(ret);
}
} else if (entry->type == WT_LSM_WORK_DROP)
diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c
index 6d08ce3aa6a..315621f2ae9 100644
--- a/src/meta/meta_apply.c
+++ b/src/meta/meta_apply.c
@@ -32,7 +32,7 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session,
WT_ERR(cursor->get_key(cursor, &uri));
if (!WT_PREFIX_MATCH(uri, "file:"))
break;
- else if (strcmp(uri, WT_METAFILE_URI) == 0)
+ if (strcmp(uri, WT_METAFILE_URI) == 0)
continue;
/*
diff --git a/src/meta/meta_table.c b/src/meta/meta_table.c
index 227d0fa9a6c..8255f004dab 100644
--- a/src/meta/meta_table.c
+++ b/src/meta/meta_table.c
@@ -12,22 +12,22 @@
* __metadata_turtle --
* Return if a key's value should be taken from the turtle file.
*/
-static int
+static bool
__metadata_turtle(const char *key)
{
switch (key[0]) {
case 'f':
if (strcmp(key, WT_METAFILE_URI) == 0)
- return (1);
+ return (true);
break;
case 'W':
if (strcmp(key, "WiredTiger version") == 0)
- return (1);
+ return (true);
if (strcmp(key, "WiredTiger version string") == 0)
- return (1);
+ return (true);
break;
}
- return (0);
+ return (false);
}
/*
@@ -37,6 +37,8 @@ __metadata_turtle(const char *key)
int
__wt_metadata_open(WT_SESSION_IMPL *session)
{
+ WT_BTREE *btree;
+
if (session->meta_dhandle != NULL)
return (0);
@@ -45,7 +47,24 @@ __wt_metadata_open(WT_SESSION_IMPL *session)
session->meta_dhandle = session->dhandle;
WT_ASSERT(session, session->meta_dhandle != NULL);
- /* The meta_dhandle doesn't need to stay locked -- release it. */
+ /*
+ * Set special flags for the metadata file: eviction (the metadata file
+ * is in-memory and never evicted), logging (the metadata file is always
+ * logged if possible).
+ *
+ * Test flags before setting them so updates can't race in subsequent
+ * opens (the first update is safe because it's single-threaded from
+ * wiredtiger_open).
+ */
+ btree = S2BT(session);
+ if (!F_ISSET(btree, WT_BTREE_IN_MEMORY))
+ F_SET(btree, WT_BTREE_IN_MEMORY);
+ if (!F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ F_SET(btree, WT_BTREE_NO_EVICTION);
+ if (F_ISSET(btree, WT_BTREE_NO_LOGGING))
+ F_CLR(btree, WT_BTREE_NO_LOGGING);
+
+ /* The metadata handle doesn't need to stay locked -- release it. */
return (__wt_session_release_btree(session));
}
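
Testing each flag before setting it, as above, means re-opens that find the
bits already in the desired state never store to the shared flags word at all,
so they cannot race with concurrent readers of that word. The guard reduces to
this (a sketch with invented names):

	#include <stdint.h>

	/* Only dirty the shared flags word when the bit actually changes. */
	static inline void
	flag_set_if_clear(uint32_t *flagsp, uint32_t flag)
	{
		if ((*flagsp & flag) == 0)
			*flagsp |= flag;
	}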
@@ -59,9 +78,9 @@ __wt_metadata_cursor(
{
WT_DATA_HANDLE *saved_dhandle;
WT_DECL_RET;
+ int is_dead;
const char *cfg[] =
{ WT_CONFIG_BASE(session, WT_SESSION_open_cursor), config, NULL };
- int is_dead;
saved_dhandle = session->dhandle;
WT_ERR(__wt_metadata_open(session));
diff --git a/src/os_posix/os_alloc.c b/src/os_posix/os_alloc.c
index 4d04f9ac579..eb2482723ec 100644
--- a/src/os_posix/os_alloc.c
+++ b/src/os_posix/os_alloc.c
@@ -58,7 +58,9 @@ __wt_calloc(WT_SESSION_IMPL *session, size_t number, size_t size, void *retp)
WT_STAT_FAST_CONN_INCR(session, memory_allocation);
if ((p = calloc(number, size)) == NULL)
- WT_RET_MSG(session, __wt_errno(), "memory allocation");
+ WT_RET_MSG(session, __wt_errno(),
+ "memory allocation of %" WT_SIZET_FMT " bytes failed",
+ size * number);
*(void **)retp = p;
return (0);
@@ -100,7 +102,9 @@ __wt_realloc(WT_SESSION_IMPL *session,
}
if ((p = realloc(p, bytes_to_allocate)) == NULL)
- WT_RET_MSG(session, __wt_errno(), "memory allocation");
+ WT_RET_MSG(session, __wt_errno(),
+ "memory allocation of %" WT_SIZET_FMT " bytes failed",
+ bytes_to_allocate);
/*
* Clear the allocated memory -- an application might: allocate memory,
@@ -171,7 +175,9 @@ __wt_realloc_aligned(WT_SESSION_IMPL *session,
if ((ret = posix_memalign(&newp,
S2C(session)->buffer_alignment,
bytes_to_allocate)) != 0)
- WT_RET_MSG(session, ret, "memory allocation");
+ WT_RET_MSG(session, ret,
+ "memory allocation of %" WT_SIZET_FMT
+ " bytes failed", bytes_to_allocate);
if (p != NULL)
memcpy(newp, p, bytes_allocated);
diff --git a/src/os_posix/os_mtx_cond.c b/src/os_posix/os_mtx_cond.c
index dfd72dd0cd2..7946b4ab0cc 100644
--- a/src/os_posix/os_mtx_cond.c
+++ b/src/os_posix/os_mtx_cond.c
@@ -41,11 +41,13 @@ err: __wt_free(session, cond);
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ *	Wait on a mutex, optionally timing out. If we are signalled
+ *	before the timeout expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
struct timespec ts;
WT_DECL_RET;
@@ -54,7 +56,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
- if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ *signalled = 1;
+ if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
/*
@@ -88,10 +91,12 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
#ifdef ETIME
ret == ETIME ||
#endif
- ret == ETIMEDOUT)
+ ret == ETIMEDOUT) {
+ *signalled = 0;
ret = 0;
+ }
- (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+ (void)__wt_atomic_subi32(&cond->waiters, 1);
err: if (locked)
WT_TRET(pthread_mutex_unlock(&cond->mtx));
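
The new interface reports how the wait ended: *signalled starts at 1 and is
cleared only on timeout. In plain pthreads the same contract looks like this
(a sketch, not WiredTiger's code; the caller is assumed to hold mtx, as
pthread_cond_timedwait requires):

	#include <errno.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <time.h>

	/*
	 * Wait up to usecs on cond; *signalled is cleared only if the
	 * wait timed out. The caller must hold mtx.
	 */
	static int
	cond_wait_signal(pthread_cond_t *cond, pthread_mutex_t *mtx,
	    uint64_t usecs, int *signalled)
	{
		struct timespec ts;
		int ret;

		*signalled = 1;
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += (time_t)(usecs / 1000000);
		ts.tv_nsec += (long)(usecs % 1000000) * 1000;
		if (ts.tv_nsec >= 1000000000) {	/* normalize the timespec */
			ts.tv_nsec -= 1000000000;
			++ts.tv_sec;
		}
		ret = pthread_cond_timedwait(cond, mtx, &ts);
		if (ret == ETIMEDOUT) {
			*signalled = 0;
			ret = 0;	/* a timeout is not an error here */
		}
		return (ret);
	}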
@@ -124,7 +129,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
if (cond->waiters == -1)
return (0);
- if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) {
WT_ERR(pthread_mutex_lock(&cond->mtx));
locked = 1;
WT_ERR(pthread_cond_broadcast(&cond->cond));
diff --git a/src/os_posix/os_mtx_rw.c b/src/os_posix/os_mtx_rw.c
index cdd4f8a24e1..d47ab197643 100644
--- a/src/os_posix/os_mtx_rw.c
+++ b/src/os_posix/os_mtx_rw.c
@@ -38,6 +38,78 @@
* Joseph Seigh. Note that a similar (but not identical) algorithm was published
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ *
+ * The following is an explanation of this code. First, the underlying lock
+ * structure.
+ *
+ * struct {
+ * uint16_t writers; Now serving for writers
+ * uint16_t readers; Now serving for readers
+ * uint16_t users; Next available ticket number
+ * uint16_t __notused; Padding
+ * }
+ *
+ * First, imagine a store's 'take a number' ticket algorithm. A customer takes
+ * a unique ticket number and customers are served in ticket order. In the data
+ * structure, 'writers' is the next writer to be served, 'readers' is the next
+ * reader to be served, and 'users' is the next available ticket number.
+ *
+ * Next, consider exclusive (write) locks. The 'now serving' number for writers
+ * is 'writers'. To lock, 'take a number' and wait until that number is being
+ * served; more specifically, atomically copy and increment the current value of
+ * 'users', and then wait until 'writers' equals that copied number.
+ *
+ * Shared (read) locks are similar. Like writers, readers atomically get the
+ * next number available. However, instead of waiting for 'writers' to equal
+ * their number, they wait for 'readers' to equal their number.
+ *
+ * This has the effect of queuing lock requests in the order they arrive
+ * (incidentally avoiding starvation).
+ *
+ * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
+ * In the case of a reader, the 'readers' increment happens when the reader
+ * acquires the lock (to allow read-lock sharing), and the 'writers' increment
+ * happens when the reader releases the lock. In the case of a writer, both
+ * 'readers' and 'writers' are incremented when the writer releases the lock.
+ *
+ * For example, consider the following read (R) and write (W) lock requests:
+ *
+ * writers readers users
+ * 0 0 0
+ * R: ticket 0, readers match OK 0 1 1
+ * R: ticket 1, readers match OK 0 2 2
+ * R: ticket 2, readers match OK 0 3 3
+ * W: ticket 3, writers no match block 0 3 4
+ * R: ticket 2, unlock 1 3 4
+ * R: ticket 0, unlock 2 3 4
+ * R: ticket 1, unlock 3 3 4
+ * W: ticket 3, writers match OK 3 3 4
+ *
+ * Note the writer blocks until 'writers' equals its ticket number and it does
+ * not matter if readers unlock in order or not.
+ *
+ * Readers or writers entering the system after the write lock is queued block,
+ * and the next ticket holder (reader or writer) will unblock when the writer
+ * unlocks. An example, continuing from the last line of the above example:
+ *
+ * writers readers users
+ * W: ticket 3, writers match OK 3 3 4
+ * R: ticket 4, readers no match block 3 3 5
+ * R: ticket 5, readers no match block 3 3 6
+ * W: ticket 6, writers no match block 3 3 7
+ * W: ticket 3, unlock 4 4 7
+ * R: ticket 4, readers match OK 4 5 7
+ * R: ticket 5, readers match OK 4 6 7
+ *
+ * The 'users' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'users'
+ * field cycles and the same ticket is taken by another thread, we could grant
+ * a lock to two separate threads at the same time, and bad things happen: two
+ * writer threads or a reader thread and a writer thread would run in parallel,
+ * and lock waiters could be skipped if the unlocks race. This is unlikely: it
+ * only happens if a lock request is blocked by 64K other requests. The fix
+ * would be to grow the lock structure fields, but the largest atomic
+ * instruction we have is 8 bytes, so the structure has no room to grow.
*/
#include "wt_internal.h"
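
The scheme described above fits in a few dozen lines. The following
self-contained sketch uses C11 atomics and sched_yield where the real code
paces with WT_PAUSE and short sleeps; it is a model of the algorithm, not
WiredTiger's implementation (which packs the three counters into one 8-byte
word so the try-lock variants can CAS the whole lock at once):

	#include <sched.h>
	#include <stdatomic.h>
	#include <stdint.h>

	/* A self-contained model of the ticket lock; fields as above. */
	typedef struct {
		_Atomic uint16_t writers;	/* now serving for writers */
		_Atomic uint16_t readers;	/* now serving for readers */
		_Atomic uint16_t users;		/* next available ticket */
	} ticket_rwlock_t;

	static void
	rw_readlock(ticket_rwlock_t *l)
	{
		uint16_t ticket;

		/* Take a ticket, wait for the readers counter to reach it. */
		ticket = atomic_fetch_add(&l->users, 1);
		while (ticket != atomic_load(&l->readers))
			sched_yield();

		/* Let the next reader in at once: read locks are shared. */
		(void)atomic_fetch_add(&l->readers, 1);
	}

	static void
	rw_readunlock(ticket_rwlock_t *l)
	{
		/* Readers can unlock concurrently, so this must be atomic. */
		(void)atomic_fetch_add(&l->writers, 1);
	}

	static void
	rw_writelock(ticket_rwlock_t *l)
	{
		uint16_t ticket;

		ticket = atomic_fetch_add(&l->users, 1);
		while (ticket != atomic_load(&l->writers))
			sched_yield();
	}

	static void
	rw_writeunlock(ticket_rwlock_t *l)
	{
		/* Serve the next ticket holder, whether reader or writer. */
		(void)atomic_fetch_add(&l->readers, 1);
		(void)atomic_fetch_add(&l->writers, 1);
	}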
@@ -69,20 +141,31 @@ __wt_rwlock_alloc(
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, users, writers;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- pad = l->s.pad;
- users = l->s.users;
- writers = l->s.writers;
- old = (pad << 48) + (users << 32) + (users << 16) + writers;
- new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ new = old = *l;
+
+ /*
+ * This read lock can only be granted if the lock was last granted to
+ * a reader and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this read lock).
+ */
+ if (old.s.readers != old.s.users)
+ return (EBUSY);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket and
+ * incrementing the reader value to match it.
+ */
+ new.s.readers = new.s.users = old.s.users + 1;
+ return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
@@ -93,8 +176,7 @@ int
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
int pause_cnt;
WT_RET(__wt_verbose(
@@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- for (pause_cnt = 0; val != l->s.readers;) {
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.readers;) {
/*
* We failed to get the lock; pause before retrying and if we've
* paused enough, sleep so we don't burn CPU to no purpose. This
* situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources. Regardless,
- * don't sleep long, all we need is to schedule the other reader
- * threads to complete a few more instructions and increment the
- * reader count.
+ * system and we're thrashing on shared resources.
+ *
+	 * Don't sleep long when waiting on a read lock; hopefully we're
+	 * waiting on another reader to increment the reader count.
*/
if (++pause_cnt < 1000)
WT_PAUSE();
@@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
__wt_sleep(0, 10);
}
+ /*
+ * We're the only writer of the readers field, so the update does not
+ * need to be atomic.
+ */
++l->s.readers;
return (0);
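The blocking path reduced to a standalone sketch, again with C11 atomics; usleep() stands in for __wt_sleep() and the empty spin for WT_PAUSE(). The structure and names are illustrative.

#include <stdatomic.h>
#include <stdint.h>
#include <unistd.h>

struct tl {
	_Atomic uint16_t writers;	/* next writer ticket to grant */
	_Atomic uint16_t readers;	/* next reader ticket to grant */
	_Atomic uint16_t users;		/* next free ticket */
};

static void
readlock(struct tl *l)
{
	uint16_t ticket;
	int pause_cnt;

	/* Take a ticket; the 16-bit value wraps at 64K requests. */
	ticket = atomic_fetch_add(&l->users, 1);

	/* Spin, then sleep, until our ticket is the one being granted. */
	for (pause_cnt = 0; ticket != atomic_load(&l->readers);) {
		if (++pause_cnt < 1000)
			continue;	/* stands in for WT_PAUSE() */
		usleep(10);		/* stands in for __wt_sleep(0, 10) */
	}

	/*
	 * Let the next queued reader proceed as well. The diff uses a
	 * plain increment because only the granted thread writes the
	 * field; an atomic add keeps this sketch simple.
	 */
	atomic_fetch_add(&l->readers, 1);
}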
@@ -138,7 +229,12 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
l = &rwlock->rwlock;
- WT_ATOMIC_ADD2(l->s.writers, 1);
+
+ /*
+ * Increment the writers value (other readers are doing the same, make
+ * sure we don't race).
+ */
+ (void)__wt_atomic_add16(&l->s.writers, 1);
return (0);
}
@@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, readers, users;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
l = &rwlock->rwlock;
- pad = l->s.pad;
- readers = l->s.readers;
- users = l->s.users;
- old = (pad << 48) + (users << 32) + (readers << 16) + users;
- new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ old = new = *l;
+
+ /*
+ * This write lock can only be granted if the lock was last granted to
+ * a writer and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this write lock).
+ */
+ if (old.s.writers != old.s.users)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.s.users;
+ return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
@@ -174,23 +278,33 @@ int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
+ int pause_cnt;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+ l = &rwlock->rwlock;
+
/*
- * Possibly wrap: if we have more than 64K lockers waiting, the count
- * of writers will wrap and two lockers will simultaneously be granted
- * the write lock.
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
*/
- l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- while (val != l->s.writers)
- WT_PAUSE();
+ ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.writers;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
return (0);
}
@@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
copy = *l;
+ /*
+ * We're the only writer of the writers/readers fields, so the update
+ * does not need to be atomic; we have to update both values at the
+ * same time though, otherwise we'd potentially race with the thread
+ * next granted the lock.
+ *
+ * Use a memory barrier to ensure the compiler doesn't mess with these
+ * instructions and rework the code in a way that avoids the update as
+ * a unit.
+ */
WT_BARRIER();
++copy.s.writers;
++copy.s.readers;
- l->i.us = copy.i.us;
+ l->i.wr = copy.i.wr;
+
return (0);
}
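The paired writers/readers update as a standalone sketch, assuming the two grant fields share one 32-bit word (little-endian, writers in the low half) as in the lock layout above; an atomic store stands in for the barrier plus plain store.

#include <stdatomic.h>
#include <stdint.h>

/* The grant half of the lock word: writers and readers as one unit. */
typedef union {
	uint32_t wr;
	struct {
		uint16_t writers, readers;
	} s;
} grant_t;

static void
writeunlock(_Atomic uint32_t *wr)
{
	grant_t copy;

	copy.wr = atomic_load(wr);

	/* Grant the next ticket to both the next writer and reader... */
	++copy.s.writers;
	++copy.s.readers;

	/* ...and publish both fields in a single 32-bit store. */
	atomic_store(wr, copy.wr);
}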
diff --git a/src/os_posix/os_open.c b/src/os_posix/os_open.c
index 7a4f5fdb38d..ef4662aa369 100644
--- a/src/os_posix/os_open.c
+++ b/src/os_posix/os_open.c
@@ -53,7 +53,7 @@ __wt_open(WT_SESSION_IMPL *session,
hash = __wt_hash_city64(name, strlen(name));
bucket = hash % WT_HASH_ARRAY_SIZE;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) {
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) {
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -167,7 +167,7 @@ setupfh:
*/
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl) {
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq) {
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -177,7 +177,7 @@ setupfh:
}
if (!matched) {
WT_CONN_FILE_INSERT(conn, fh, bucket);
- (void)WT_ATOMIC_ADD4(conn->open_file_count, 1);
+ (void)__wt_atomic_add32(&conn->open_file_count, 1);
*fhp = fh;
}
__wt_spin_unlock(session, &conn->fh_lock);
@@ -213,6 +213,8 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
fh = *fhp;
*fhp = NULL;
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: close", fh->name));
+
__wt_spin_lock(session, &conn->fh_lock);
if (fh == NULL || fh->ref == 0 || --fh->ref > 0) {
__wt_spin_unlock(session, &conn->fh_lock);
@@ -222,7 +224,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
/* Remove from the list. */
bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_FILE_REMOVE(conn, fh, bucket);
- (void)WT_ATOMIC_SUB4(conn->open_file_count, 1);
+ (void)__wt_atomic_sub32(&conn->open_file_count, 1);
__wt_spin_unlock(session, &conn->fh_lock);
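For reference, the lookup-and-reference pattern these hunks implement, reduced to a standalone sketch over <sys/queue.h>; the fh structure and bucket count are illustrative.

#include <stdint.h>
#include <string.h>
#include <sys/queue.h>

#define N_BUCKETS 512			/* illustrative bucket count */

struct fh {
	const char *name;		/* file name, the hash key */
	unsigned ref;			/* reference count */
	TAILQ_ENTRY(fh) hashq;		/* bucket linkage, as in the diff */
};
TAILQ_HEAD(fh_bucket, fh);

static struct fh_bucket buckets[N_BUCKETS];

static void
fh_hash_init(void)
{
	unsigned i;

	for (i = 0; i < N_BUCKETS; ++i)
		TAILQ_INIT(&buckets[i]);
}

/* Find an open handle by name and take a reference, or return NULL. */
static struct fh *
fh_lookup(uint64_t hash, const char *name)
{
	struct fh *fh;

	TAILQ_FOREACH(fh, &buckets[hash % N_BUCKETS], hashq)
		if (strcmp(name, fh->name) == 0) {
			++fh->ref;
			return (fh);
		}
	return (NULL);
}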
diff --git a/src/os_posix/os_path.c b/src/os_posix/os_path.c
index 07b14b55b44..af28e1b3b56 100644
--- a/src/os_posix/os_path.c
+++ b/src/os_posix/os_path.c
@@ -12,10 +12,10 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
- return (path[0] == '/' ? 1 : 0);
+ return (path[0] == '/');
}
/*
diff --git a/src/os_posix/os_remove.c b/src/os_posix/os_remove.c
index 3fc692d8755..96bbba9bab2 100644
--- a/src/os_posix/os_remove.c
+++ b/src/os_posix/os_remove.c
@@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name)
* level should have closed it before removing.
*/
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
if (strcmp(name, fh->name) == 0)
break;
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/os_posix/os_thread.c b/src/os_posix/os_thread.c
index e4f24cdb44e..c7222aac6c4 100644
--- a/src/os_posix/os_thread.c
+++ b/src/os_posix/os_thread.c
@@ -19,7 +19,8 @@ __wt_thread_create(WT_SESSION_IMPL *session,
WT_DECL_RET;
/* Spawn a new thread of control. */
- if ((ret = pthread_create(tidret, NULL, func, arg)) == 0)
+ WT_SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret);
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "pthread_create");
}
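WT_SYSCALL_RETRY's definition is not part of this diff; below is a plausible sketch of the pattern it names, retrying a call a bounded number of times on transient failures. The retried error values and iteration count are assumptions.

#include <errno.h>
#include <pthread.h>

#define SYSCALL_RETRY(call, ret) do {					\
	int __retry;							\
	for (__retry = 0; __retry < 10; ++__retry) {			\
		(ret) = (call);						\
		/* Retry only plausibly-transient failures. */		\
		if ((ret) != EAGAIN && (ret) != EBUSY &&		\
		    (ret) != EINTR)					\
			break;						\
	}								\
} while (0)

static int
thread_create(pthread_t *tidret, void *(*func)(void *), void *arg)
{
	int ret;

	/* pthread_create returns an error number, not -1 and errno. */
	SYSCALL_RETRY(pthread_create(tidret, NULL, func, arg), ret);
	return (ret);
}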
@@ -33,7 +34,8 @@ __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid)
{
WT_DECL_RET;
- if ((ret = pthread_join(tid, NULL)) == 0)
+ WT_SYSCALL_RETRY(pthread_join(tid, NULL), ret);
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "pthread_join");
diff --git a/src/os_win/os_errno.c b/src/os_win/os_errno.c
index 097c73b5731..a9d3d521052 100644
--- a/src/os_win/os_errno.c
+++ b/src/os_win/os_errno.c
@@ -22,7 +22,7 @@ __wt_map_error_to_windows_error(int error) {
Also validate we do not get any COM errors
(which are negative integers)
*/
- WT_ASSERT(NULL, error > 0 && error > -(windows_error_offset));
+ WT_ASSERT(NULL, error < 0);
return (error + -(windows_error_offset));
}
@@ -96,7 +96,7 @@ __wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen)
snprintf(errbuf, errlen, "%s", buf) > 0)
return (errbuf);
if (lasterror != 0 && session != NULL &&
- __wt_buf_set(session, &session->err, buf, strlen(buf)) == 0)
+ __wt_buf_fmt(session, &session->err, "%s", buf) == 0)
return (session->err.data);
}
diff --git a/src/os_win/os_mtx_cond.c b/src/os_win/os_mtx_cond.c
index 51f6d6533c8..14ca5d61282 100644
--- a/src/os_win/os_mtx_cond.c
+++ b/src/os_win/os_mtx_cond.c
@@ -37,13 +37,15 @@ __wt_cond_alloc(WT_SESSION_IMPL *session,
}
/*
- * __wt_cond_wait --
- * Wait on a mutex, optionally timing out.
+ * __wt_cond_wait_signal --
+ * Wait on a mutex, optionally timing out. If we get it
+ * before the time out period expires, let the caller know.
*/
int
-__wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
+__wt_cond_wait_signal(
+ WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs, int *signalled)
{
- DWORD milliseconds;
+ DWORD err, milliseconds;
WT_DECL_RET;
uint64_t milliseconds64;
int locked;
@@ -51,7 +53,8 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
locked = 0;
/* Fast path if already signalled. */
- if (WT_ATOMIC_ADD4(cond->waiters, 1) == 0)
+ *signalled = 1;
+ if (__wt_atomic_addi32(&cond->waiters, 1) == 0)
return (0);
/*
@@ -91,17 +94,25 @@ __wt_cond_wait(WT_SESSION_IMPL *session, WT_CONDVAR *cond, uint64_t usecs)
ret = SleepConditionVariableCS(
&cond->cond, &cond->mtx, INFINITE);
+ /*
+ * SleepConditionVariableCS returns non-zero on success, 0 on timeout
+ * or failure. Check for timeout, else convert to a WiredTiger error
+ * value and fail.
+ */
if (ret == 0) {
- if (GetLastError() == ERROR_TIMEOUT) {
- ret = 1;
- }
- }
+ if ((err = GetLastError()) == ERROR_TIMEOUT)
+ *signalled = 0;
+ else
+ ret = __wt_errno();
+ } else
+ ret = 0;
- (void)WT_ATOMIC_SUB4(cond->waiters, 1);
+ (void)__wt_atomic_subi32(&cond->waiters, 1);
if (locked)
LeaveCriticalSection(&cond->mtx);
- if (ret != 0)
+
+ if (ret == 0)
return (0);
WT_RET_MSG(session, ret, "SleepConditionVariableCS");
}
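A POSIX analogue of the contract the Windows hunk establishes: wait with a timeout and report through *signalled whether we were woken or simply gave up. Names are illustrative; pthread_cond_timedwait returns ETIMEDOUT on timeout, which is not treated as an error.

#include <errno.h>
#include <pthread.h>
#include <stdint.h>
#include <time.h>

static int
cond_wait_signal(pthread_mutex_t *mtx,
    pthread_cond_t *cond, uint64_t usecs, int *signalled)
{
	struct timespec abstime;
	int ret;

	*signalled = 1;

	pthread_mutex_lock(mtx);
	clock_gettime(CLOCK_REALTIME, &abstime);
	abstime.tv_sec += (time_t)(usecs / 1000000);
	abstime.tv_nsec += (long)(usecs % 1000000) * 1000;
	if (abstime.tv_nsec >= 1000000000) {
		++abstime.tv_sec;
		abstime.tv_nsec -= 1000000000;
	}
	ret = pthread_cond_timedwait(cond, mtx, &abstime);
	pthread_mutex_unlock(mtx);

	if (ret == ETIMEDOUT) {		/* a timeout is not an error */
		*signalled = 0;
		ret = 0;
	}
	return (ret);
}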
@@ -130,7 +141,7 @@ __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond)
if (cond->waiters == -1)
return (0);
- if (cond->waiters > 0 || !WT_ATOMIC_CAS4(cond->waiters, 0, -1)) {
+ if (cond->waiters > 0 || !__wt_atomic_casi32(&cond->waiters, 0, -1)) {
EnterCriticalSection(&cond->mtx);
locked = 1;
WakeAllConditionVariable(&cond->cond);
diff --git a/src/os_win/os_open.c b/src/os_win/os_open.c
index a77bef63b9d..3bd24369242 100644
--- a/src/os_win/os_open.c
+++ b/src/os_win/os_open.c
@@ -39,7 +39,7 @@ __wt_open(WT_SESSION_IMPL *session,
/* Increment the reference count if we already have the file open. */
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq)
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -160,7 +160,7 @@ setupfh:
*/
matched = 0;
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(tfh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(tfh, &conn->fhhash[bucket], hashq)
if (strcmp(name, tfh->name) == 0) {
++tfh->ref;
*fhp = tfh;
@@ -169,7 +169,7 @@ setupfh:
}
if (!matched) {
WT_CONN_FILE_INSERT(conn, fh, bucket);
- (void)WT_ATOMIC_ADD4(conn->open_file_count, 1);
+ (void)__wt_atomic_add32(&conn->open_file_count, 1);
*fhp = fh;
}
@@ -217,7 +217,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
/* Remove from the list. */
bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
WT_CONN_FILE_REMOVE(conn, fh, bucket);
- (void)WT_ATOMIC_SUB4(conn->open_file_count, 1);
+ (void)__wt_atomic_sub32(&conn->open_file_count, 1);
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/os_win/os_path.c b/src/os_win/os_path.c
index 89f05e238c4..9d001e50571 100644
--- a/src/os_win/os_path.c
+++ b/src/os_win/os_path.c
@@ -12,7 +12,7 @@
* __wt_absolute_path --
* Return if a filename is an absolute path.
*/
-int
+bool
__wt_absolute_path(const char *path)
{
/*
@@ -21,7 +21,7 @@ __wt_absolute_path(const char *path)
*/
if (strlen(path) >= 3 && isalpha(path[0]) && path[1] == ':')
path += 2;
- return (path[0] == '/' || path[0] == '\\' ? 1 : 0);
+ return (path[0] == '/' || path[0] == '\\');
}
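A quick standalone check of the drive-letter handling above; the assertions are illustrative.

#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>

static bool
absolute_path_win(const char *path)
{
	/* Skip a drive-letter prefix such as "C:". */
	if (strlen(path) >= 3 &&
	    isalpha((unsigned char)path[0]) && path[1] == ':')
		path += 2;
	return (path[0] == '/' || path[0] == '\\');
}

int
main(void)
{
	assert(absolute_path_win("C:\\db\\WiredTiger.wt"));
	assert(absolute_path_win("/var/db"));
	assert(!absolute_path_win("C:relative"));
	assert(!absolute_path_win("file.wt"));
	return (0);
}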
/*
diff --git a/src/os_win/os_remove.c b/src/os_win/os_remove.c
index 0c6396c775f..55b50030064 100644
--- a/src/os_win/os_remove.c
+++ b/src/os_win/os_remove.c
@@ -29,7 +29,7 @@ __remove_file_check(WT_SESSION_IMPL *session, const char *name)
* level should have closed it before removing.
*/
__wt_spin_lock(session, &conn->fh_lock);
- SLIST_FOREACH(fh, &conn->fhhash[bucket], hashl)
+ TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
if (strcmp(name, fh->name) == 0)
break;
__wt_spin_unlock(session, &conn->fh_lock);
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 37acb28a00b..10daa8b717c 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -27,18 +27,30 @@ typedef struct {
WT_ITEM dsk; /* Temporary disk-image buffer */
- /* Track whether all changes to the page are written. */
+ /*
+ * Track start/stop write generation to decide if all changes to the
+ * page are written.
+ */
+ uint32_t orig_write_gen;
+
+ /*
+ * Track start/stop checkpoint generations to decide if lookaside table
+ * records are correct.
+ */
+ uint64_t orig_btree_checkpoint_gen;
+ uint64_t orig_txn_checkpoint_gen;
+
+ /*
+ * Track maximum transaction ID seen and first unwritten transaction ID.
+ */
uint64_t max_txn;
uint64_t first_dirty_txn;
- uint32_t orig_write_gen;
/*
- * If page updates are skipped because they are as yet unresolved, or
- * the page has updates we cannot discard, the page is left "dirty":
- * the page cannot be discarded and a subsequent reconciliation will
- * be necessary to discard the page.
+ * When we can't mark the page clean (for example, checkpoint found some
+ * uncommitted updates), there's a leave-dirty flag.
*/
- int leave_dirty;
+ int leave_dirty;
/*
* Raw compression (don't get me started, as if normal reconciliation
@@ -153,18 +165,12 @@ typedef struct {
void *dsk; /* Split's disk image */
/*
- * When busy pages get large, we need to be able to evict them
- * even when they contain unresolved updates, or updates which
- * cannot be evicted because of running transactions. In such
- * cases, break the page into multiple blocks, write the blocks
- * that can be evicted, saving lists of updates for blocks that
- * cannot be evicted, then re-instantiate the blocks that cannot
- * be evicted as new, in-memory pages, restoring the updates on
- * those pages.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* The key for a row-store page; no column-store key is needed
@@ -220,12 +226,14 @@ typedef struct {
size_t space_avail; /* Remaining space in this chunk */
/*
- * While reviewing updates for each page, we store skipped updates here,
- * and then move them to per-block areas as the blocks are defined.
+ * Saved update list, supporting the WT_EVICT_UPDATE_RESTORE and
+ * WT_EVICT_LOOKASIDE configurations. While reviewing updates for each
+ * page, we save WT_UPDATE lists here, and then move them to per-block
+ * areas as the blocks are defined.
*/
- WT_UPD_SKIPPED *skip; /* Skipped updates */
- uint32_t skip_next;
- size_t skip_allocated;
+ WT_SAVE_UPD *supd; /* Saved updates */
+ uint32_t supd_next;
+ size_t supd_allocated;
/*
* We don't need to keep the 0th key around on internal pages, the
@@ -277,7 +285,10 @@ typedef struct {
WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
- int tested_ref_state; /* Debugging information */
+ int cache_write_lookaside; /* Used the lookaside table */
+ int cache_write_restore; /* Used update/restoration */
+
+ uint32_t tested_ref_state; /* Debugging information */
} WT_RECONCILE;
static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
@@ -318,8 +329,11 @@ static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
static int __rec_split_write(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
+static int __rec_update_las(
+ WT_SESSION_IMPL *, WT_RECONCILE *, uint32_t, WT_BOUNDARY *);
static int __rec_write_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
+static int __rec_write_status(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
@@ -338,31 +352,19 @@ int
__wt_reconcile(WT_SESSION_IMPL *session,
WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
{
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
- int page_lock, scan_lock, split_lock;
- conn = S2C(session);
page = ref->page;
mod = page->modify;
- page_lock = scan_lock = split_lock = 0;
-
- /* We're shouldn't get called with a clean page, that's an error. */
- if (!__wt_page_is_modified(page))
- WT_RET_MSG(session, WT_ERROR,
- "Attempt to reconcile a clean page.");
WT_RET(__wt_verbose(session,
WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
- WT_STAT_FAST_CONN_INCR(session, rec_pages);
- WT_STAT_FAST_DATA_INCR(session, rec_pages);
- if (LF_ISSET(WT_EVICTING)) {
- WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
- WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
- }
+
+ /* We shouldn't get called with a clean page, that's an error. */
+ WT_ASSERT(session, __wt_page_is_modified(page));
#ifdef HAVE_DIAGNOSTIC
{
@@ -386,39 +388,15 @@ __wt_reconcile(WT_SESSION_IMPL *session,
r = session->reconcile;
/*
- * The compaction process looks at the page's modification information;
- * if compaction is running, acquire the page's lock.
- */
- if (conn->compact_in_memory_pass) {
- WT_PAGE_LOCK(session, page);
- page_lock = 1;
- }
-
- /*
- * Reconciliation reads the lists of updates, so obsolete updates cannot
- * be discarded while reconciliation is in progress.
- */
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- scan_lock = 1;
-
- /*
- * Mark internal pages as splitting to ensure we don't deadlock when
- * performing an in-memory split during a checkpoint.
+ * Reconciliation locks the page for three reasons:
+ * Reconciliation reads the lists of page updates, obsolete updates
+ * cannot be discarded while reconciliation is in progress;
+ * The compaction process reads page modification information, which
+ * reconciliation modifies;
+ * In-memory splits: reconciliation of an internal page cannot handle
+ * a child page splitting during the reconciliation.
*/
- if (WT_PAGE_IS_INTERNAL(page)) {
- for (;;) {
- F_CAS_ATOMIC(page, WT_PAGE_SPLIT_LOCKED, ret);
- if (ret == 0)
- break;
- __wt_yield();
- }
- split_lock = 1;
- }
+ F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION);
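F_CAS_ATOMIC_WAIT's definition is not in this hunk; plausibly it loops on an atomic compare-and-swap of the flag and yields between attempts, like the removed open-coded loops above. A sketch under that assumption:

#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>

/* Spin until we atomically transition the flag from clear to set. */
static void
flag_cas_wait(_Atomic uint32_t *flags, uint32_t flag)
{
	uint32_t old;

	for (;;) {
		old = atomic_load(flags);
		if ((old & flag) == 0 &&
		    atomic_compare_exchange_weak(flags, &old, old | flag))
			return;
		sched_yield();		/* analogous to __wt_yield() */
	}
}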
/* Reconcile the page. */
switch (page->type) {
@@ -445,19 +423,34 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_ILLEGAL_VALUE_SET(session);
}
+ /* Get the final status for the reconciliation. */
+ if (ret == 0)
+ ret = __rec_write_status(session, r, page);
+
/* Wrap up the page reconciliation. */
if (ret == 0)
ret = __rec_write_wrapup(session, r, page);
else
WT_TRET(__rec_write_wrapup_err(session, r, page));
- /* Release the locks we're holding. */
- if (split_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SPLIT_LOCKED);
- if (scan_lock)
- F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
- if (page_lock)
- WT_PAGE_UNLOCK(session, page);
+ /* Release the reconciliation lock. */
+ F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION);
+
+ /* Update statistics. */
+ WT_STAT_FAST_CONN_INCR(session, rec_pages);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages);
+ if (LF_ISSET(WT_EVICTING)) {
+ WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
+ WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
+ }
+ if (r->cache_write_lookaside) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_lookaside);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_lookaside);
+ }
+ if (r->cache_write_restore) {
+ WT_STAT_FAST_CONN_INCR(session, cache_write_restore);
+ WT_STAT_FAST_DATA_INCR(session, cache_write_restore);
+ }
/*
* Clean up the boundary structures: some workloads result in millions
@@ -489,6 +482,125 @@ __wt_reconcile(WT_SESSION_IMPL *session,
}
/*
+ * __rec_las_checkpoint_test --
+ * Return if the lookaside table is going to collide with a checkpoint.
+ */
+static inline bool
+__rec_las_checkpoint_test(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_BTREE *btree;
+
+ conn = S2C(session);
+ btree = S2BT(session);
+
+ /*
+ * Running checkpoints can collide with the lookaside table because
+ * reconciliation using the lookaside table writes the key's last
+ * committed value, which might not be the value checkpoint would write.
+ * If reconciliation was configured for lookaside table eviction, this
+ * file participates in checkpoints, and any of the tree or system
+ * transactional generation numbers don't match, there's a possible
+ * collision.
+ *
+ * It's a complicated test, but the alternative is to have checkpoint
+ * drain lookaside table reconciliations, and this isn't a problem for
+ * most workloads.
+ */
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (false);
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ return (false);
+ if (r->orig_btree_checkpoint_gen == btree->checkpoint_gen &&
+ r->orig_txn_checkpoint_gen == conn->txn_global.checkpoint_gen &&
+ r->orig_btree_checkpoint_gen == r->orig_txn_checkpoint_gen)
+ return (false);
+ return (true);
+}
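The generation test in isolation: reconciliation sampled both generations when it started, and any movement since (or a mismatch between them at the start) means a checkpoint may be active. Structure and names below are illustrative.

#include <stdbool.h>
#include <stdint.h>

struct ckpt_gens {
	uint64_t btree_gen;		/* tree's checkpoint generation */
	uint64_t txn_gen;		/* system checkpoint generation */
};

static bool
las_checkpoint_collision(
    const struct ckpt_gens *orig, const struct ckpt_gens *now)
{
	return (orig->btree_gen != now->btree_gen ||
	    orig->txn_gen != now->txn_gen ||
	    orig->btree_gen != orig->txn_gen);
}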
+
+/*
+ * __rec_write_status --
+ * Return the final status for reconciliation.
+ */
+static int
+__rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+ WT_PAGE_MODIFY *mod;
+
+ btree = S2BT(session);
+ mod = page->modify;
+
+ /* Check for a lookaside table and checkpoint collision. */
+ if (__rec_las_checkpoint_test(session, r))
+ return (EBUSY);
+
+ /*
+ * Set the page's status based on whether or not we cleaned the page.
+ */
+ if (r->leave_dirty) {
+ /*
+ * Update the page's first unwritten transaction ID.
+ */
+ mod->first_dirty_txn = r->first_dirty_txn;
+
+ /*
+ * The page remains dirty.
+ *
+ * Any checkpoint call cleared the tree's modified flag before
+ * writing pages, so we must explicitly reset it. We insert a
+ * barrier after the change for clarity (the requirement is that
+ * the flag be set before a subsequent checkpoint reads it, and
+ * as the current checkpoint is waiting on this reconciliation
+ * to complete, there's no risk of that happening).
+ */
+ btree->modified = 1;
+ WT_FULL_BARRIER();
+
+ /*
+ * Eviction should only be here if following the save/restore
+ * eviction path.
+ */
+ WT_ASSERT(session,
+ !F_ISSET(r, WT_EVICTING) ||
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
+ } else {
+ /*
+ * Track the page's maximum transaction ID (used to decide if
+ * we're likely to be able to evict this page in the future).
+ */
+ mod->rec_max_txn = r->max_txn;
+
+ /*
+ * Track the tree's maximum transaction ID (used to decide if
+ * it's safe to discard the tree). Reconciliation for eviction
+ * is multi-threaded, only update the tree's maximum transaction
+ * ID when doing a checkpoint. That's sufficient, we only care
+ * about the maximum transaction ID of current updates in the
+ * tree, and checkpoint visits every dirty page in the tree.
+ */
+ if (!F_ISSET(r, WT_EVICTING) &&
+ WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
+ btree->rec_max_txn = r->max_txn;
+
+ /*
+ * The page only might be clean; if the write generation is
+ * unchanged since reconciliation started, it's clean.
+ *
+ * If the write generation changed, the page has been written
+ * since reconciliation started and remains dirty (that can't
+ * happen when evicting, the page is exclusively locked).
+ */
+ if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ __wt_cache_dirty_decr(session, page);
+ else
+ WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
+ }
+
+ return (0);
+}
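The mark-clean decision in isolation: the page is clean only if its write generation is unchanged since reconciliation sampled it, and resetting the generation in the same compare-and-swap means any racing updater keeps the page dirty. A sketch with C11 atomics; the structure is illustrative.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct page_modify {			/* illustrative subset */
	_Atomic uint32_t write_gen;	/* bumped by every page update */
};

static bool
mark_clean(struct page_modify *mod, uint32_t orig_write_gen)
{
	uint32_t expected = orig_write_gen;

	/* Clean only if nobody updated the page while we wrote it. */
	return (atomic_compare_exchange_strong(
	    &mod->write_gen, &expected, 0));
}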
+
+/*
* __rec_root_write --
* Handle the write of a root page.
*/
@@ -577,7 +689,7 @@ err: __wt_page_out(session, &next);
* __rec_raw_compression_config --
* Configure raw compression.
*/
-static inline int
+static inline bool
__rec_raw_compression_config(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
{
@@ -588,11 +700,11 @@ __rec_raw_compression_config(
/* Check if raw compression configured. */
if (btree->compressor == NULL ||
btree->compressor->compress_raw == NULL)
- return (0);
+ return (false);
/* Only for row-store and variable-length column-store objects. */
if (page->type == WT_PAGE_COL_FIX)
- return (0);
+ return (false);
/*
* Raw compression cannot support dictionary compression. (Technically,
@@ -602,11 +714,11 @@ __rec_raw_compression_config(
* that seems an unlikely use case.)
*/
if (btree->dictionary != 0)
- return (0);
+ return (false);
/* Raw compression cannot support prefix compression. */
if (btree->prefix_compression != 0)
- return (0);
+ return (false);
/*
* Raw compression is also turned off during salvage: we can't allow
@@ -614,9 +726,9 @@ __rec_raw_compression_config(
* can't manipulate the page size.
*/
if (salvage != NULL)
- return (0);
+ return (false);
- return (1);
+ return (true);
}
/*
@@ -628,10 +740,12 @@ __rec_write_init(WT_SESSION_IMPL *session,
WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
{
WT_BTREE *btree;
+ WT_CONNECTION_IMPL *conn;
WT_PAGE *page;
WT_RECONCILE *r;
btree = S2BT(session);
+ conn = S2C(session);
page = ref->page;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
@@ -648,9 +762,59 @@ __rec_write_init(WT_SESSION_IMPL *session,
F_SET(&r->dsk, WT_ITEM_ALIGNED);
}
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ WT_ASSERT(session, r->ref == NULL);
+
/* Remember the configuration. */
r->ref = ref;
r->page = page;
+
+ /*
+ * Save the page's write generation before reading the page.
+ * Save the transaction generations before reading the page.
+ * These are all ordered reads, but we only need one.
+ */
+ r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
+ r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen;
+ WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Lookaside table eviction is configured when eviction gets aggressive,
+ * adjust the flags for cases we don't support.
+ */
+ if (LF_ISSET(WT_EVICT_LOOKASIDE)) {
+ /*
+ * Saving lookaside table updates into the lookaside table won't
+ * work.
+ */
+ if (F_ISSET(btree, WT_BTREE_LOOKASIDE))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * We don't yet support fixed-length column-store combined with
+ * the lookaside table. It's not hard to do, but the underlying
+ * function that reviews which updates can be written to the
+ * evicted page and which updates need to be written to the
+ * lookaside table needs access to the original value from the
+ * page being evicted, and there's no code path for that in the
+ * case of fixed-length column-store objects. (Row-store and
+ * variable-width column-store objects provide a reference to
+ * the unpacked on-page cell for this purpose, but there isn't
+ * an on-page cell for fixed-length column-store objects.) For
+ * now, turn it off.
+ */
+ if (page->type == WT_PAGE_COL_FIX)
+ LF_CLR(WT_EVICT_LOOKASIDE);
+
+ /*
+ * Check for a lookaside table and checkpoint collision, and if
+ * we find one, turn off the lookaside file (we've gone to all
+ * the effort of getting exclusive access to the page, might as
+ * well try and evict it).
+ */
+ if (__rec_las_checkpoint_test(session, r))
+ LF_CLR(WT_EVICT_LOOKASIDE);
+ }
r->flags = flags;
/* Track if the page can be marked clean. */
@@ -668,8 +832,8 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->all_empty_value = 1;
r->any_empty_value = 0;
- /* The list of cached, skipped updates. */
- r->skip_next = 0;
+ /* The list of saved updates. */
+ r->supd_next = 0;
/*
* Dictionary compression only writes repeated values once. We grow
@@ -714,14 +878,11 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->salvage = salvage;
- /* Save the page's write generation before reading the page. */
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
-
/*
* Running transactions may update the page after we write it, so
* this is the highest ID we can be confident we will see.
*/
- r->first_dirty_txn = S2C(session)->txn_global.last_running;
+ r->first_dirty_txn = conn->txn_global.last_running;
return (0);
}
@@ -748,7 +909,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
__rec_bnd_cleanup(session, r, 1);
- __wt_free(session, r->skip);
+ __wt_free(session, r->supd);
__wt_buf_free(session, &r->k.buf);
__wt_buf_free(session, &r->v.buf);
@@ -784,6 +945,9 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
if (r->bnd == NULL)
return;
+ /* Reconciliation is not re-entrant, make sure that doesn't happen. */
+ r->ref = NULL;
+
/*
* Free the boundary structures' memory. In the case of normal cleanup,
* discard any memory we won't reuse in the next reconciliation; in the
@@ -799,7 +963,7 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
__wt_buf_free(session, &bnd->key);
}
__wt_free(session, r->bnd);
@@ -820,66 +984,84 @@ __rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
+ __wt_free(session, bnd->supd);
}
}
}
/*
- * __rec_skip_update_save --
- * Save a skipped WT_UPDATE list for later restoration.
+ * __rec_block_free --
+ * Helper function to free a block.
*/
static int
-__rec_skip_update_save(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
+__rec_block_free(
+ WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
+{
+ WT_BM *bm;
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+ bm = btree->bm;
+
+ return (bm->free(bm, session, addr, addr_size));
+}
+
+/*
+ * __rec_update_save --
+ * Save a WT_UPDATE list for later restoration.
+ */
+static int
+__rec_update_save(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, uint64_t txnid)
{
WT_RET(__wt_realloc_def(
- session, &r->skip_allocated, r->skip_next + 1, &r->skip));
- r->skip[r->skip_next].ins = ins;
- r->skip[r->skip_next].rip = rip;
- ++r->skip_next;
+ session, &r->supd_allocated, r->supd_next + 1, &r->supd));
+ r->supd[r->supd_next].ins = ins;
+ r->supd[r->supd_next].rip = rip;
+ r->supd[r->supd_next].onpage_txn = txnid;
+ ++r->supd_next;
return (0);
}
/*
- * __rec_skip_update_move --
- * Move a skipped WT_UPDATE list from the per-page cache to a specific
+ * __rec_update_move --
+ * Move a saved WT_UPDATE list from the per-page cache to a specific
* block's list.
*/
static int
-__rec_skip_update_move(
- WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
+__rec_update_move(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_SAVE_UPD *supd)
{
WT_RET(__wt_realloc_def(
- session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
- bnd->skip[bnd->skip_next] = *skip;
- ++bnd->skip_next;
+ session, &bnd->supd_allocated, bnd->supd_next + 1, &bnd->supd));
+ bnd->supd[bnd->supd_next] = *supd;
+ ++bnd->supd_next;
- skip->ins = NULL;
- skip->rip = NULL;
+ supd->ins = NULL;
+ supd->rip = NULL;
return (0);
}
/*
* __rec_txn_read --
- * Return the first visible update in a list (or NULL if none are visible),
- * set a flag if any updates were skipped, track the maximum transaction ID on
- * the page.
+ * Return the update in a list that should be written (or NULL if none can
+ * be written).
*/
-static inline int
+static int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
+ WT_BTREE *btree;
WT_DECL_RET;
- WT_ITEM ovfl;
+ WT_DECL_ITEM(tmp);
WT_PAGE *page;
- WT_UPDATE *upd, *upd_list, *upd_ovfl;
+ WT_UPDATE *append, *upd, *upd_list;
size_t notused;
uint64_t max_txn, min_txn, txnid;
- int skipped;
+ int append_origv, skipped;
*updp = NULL;
+ btree = S2BT(session);
page = r->page;
/*
@@ -893,13 +1075,16 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
} else
upd_list = ins->upd;
- skipped = 0;
- for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
- upd != NULL; upd = upd->next) {
+ for (skipped = 0,
+ max_txn = WT_TXN_NONE, min_txn = UINT64_MAX,
+ upd = upd_list; upd != NULL; upd = upd->next) {
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
- /* Track the largest/smallest transaction IDs on the list. */
+ /*
+ * Track the largest/smallest transaction IDs on the list and
+ * the smallest not-globally-visible transaction on the page.
+ */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
if (WT_TXNID_LT(txnid, min_txn))
@@ -909,132 +1094,231 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
r->first_dirty_txn = txnid;
/*
- * Record whether any updates were skipped on the way to finding
- * the first visible update.
- *
- * If updates were skipped before the one being written, future
- * reads without intervening modifications to the page could
- * see a different value; if no updates were skipped, the page
- * can safely be marked clean and does not need to be
- * reconciled until modified again.
+ * Find the first update we can use.
*/
- if (*updp == NULL) {
- if (__wt_txn_visible(session, txnid))
- *updp = upd;
- else
+ if (F_ISSET(r, WT_EVICTING)) {
+ /*
+ * Eviction can write any committed update.
+ *
+ * When reconciling for eviction, track whether any
+ * uncommitted updates are found.
+ */
+ if (__wt_txn_committed(session, txnid)) {
+ if (*updp == NULL)
+ *updp = upd;
+ } else
skipped = 1;
+ } else {
+ /*
+ * Checkpoint can only write updates visible as of its
+ * snapshot.
+ *
+ * When reconciling for a checkpoint, track whether any
+ * updates were skipped on the way to finding the first
+ * visible update.
+ */
+ if (*updp == NULL) {
+ if (__wt_txn_visible(session, txnid))
+ *updp = upd;
+ else
+ skipped = 1;
+ }
}
}
/*
+ * If all of the updates were aborted, quit. This test is not strictly
+ * necessary because the above loop exits with skipped not set and the
+ * maximum transaction left at its initial value of WT_TXN_NONE, so
+ * the test below will be true and return, but it's cheap and a
+ * little more explicit, and makes Coverity happy.
+ */
+ if (max_txn == WT_TXN_NONE)
+ return (0);
+
+ /*
* Track the maximum transaction ID in the page. We store this in the
- * page at the end of reconciliation if no updates are skipped, it's
- * used to avoid evicting clean pages from memory with changes required
- * to satisfy a snapshot read.
+ * tree at the end of reconciliation in the service of checkpoints; it
+ * is used to avoid discarding trees from memory when they have changes
+ * required to satisfy a snapshot read.
*/
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
/*
- * If no updates were skipped and all updates are globally visible, the
- * page can be marked clean and we're done, regardless of whether we're
- * evicting or checkpointing.
+ * If there are no skipped updates and all updates are globally visible,
+ * the page can be marked clean and we're done, regardless of whether
+ * we're evicting or checkpointing.
*
* We have to check both: the oldest transaction ID may have moved while
- * we were scanning the update list, so it is possible to skip an update
- * but then find that by the end of the scan, all updates are stable.
+ * we were scanning the update list, so it is possible to find a skipped
+ * update, but then find all updates are stable at the end of the scan.
+ *
+ * Skip the visibility check for the lookaside table as a special-case,
+ * we know there are no older readers of that table.
*/
- if (!skipped && __wt_txn_visible_all(session, max_txn))
+ if (!skipped &&
+ (F_ISSET(btree, WT_BTREE_LOOKASIDE) ||
+ __wt_txn_visible_all(session, max_txn)))
return (0);
/*
- * If some updates are not globally visible, or were skipped, the page
- * cannot be marked clean.
+ * In some cases, there had better not be skipped updates or updates not
+ * yet globally visible.
*/
- r->leave_dirty = 1;
-
- /* If we're not evicting, we're done, we know what we'll write. */
- if (!F_ISSET(r, WT_EVICTING))
- return (0);
-
- /* In some cases, there had better not be any updates we can't write. */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ if (F_ISSET(r, WT_VISIBILITY_ERR))
WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
+ "reconciliation error, uncommitted update or update not "
+ "globally visible");
/*
- * If evicting and we aren't able to save/restore the not-yet-visible
- * updates, the page can't be evicted.
+ * If not trying to evict the page, we know what we'll write and we're
+ * done. Because some updates were skipped or are not globally visible,
+ * the page can't be marked clean.
*/
- if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
- return (EBUSY);
+ if (!F_ISSET(r, WT_EVICTING)) {
+ r->leave_dirty = 1;
+ return (0);
+ }
/*
- * Evicting a page with not-yet-visible updates: save and restore the
- * list of updates on a newly instantiated page.
- *
- * The order of the updates on the list matters so we can't move only
- * the unresolved updates, we have to move the entire update list.
+ * Evicting with either uncommitted changes or not-yet-globally-visible
+ * changes. There are two ways to continue, the save/restore eviction
+ * path or the lookaside table eviction path. Both cannot be configured
+ * because the paths track different information. The save/restore path
+ * can handle both uncommitted and not-yet-globally-visible changes, by
+ * evicting most of the page and then creating a new, smaller page into
+ * which we re-instantiate those changes. The lookaside table path can
+ * only handle not-yet-globally-visible changes by writing those changes
+ * into the lookaside table and restoring them on demand if and when the
+ * page is read back into memory.
*
- * Clear the returned update so our caller ignores the key/value pair
- * in the case of an insert/append entry (everything we need is in the
- * update list), and otherwise writes the original on-page key/value
- * pair to which the update list applies.
+ * Both paths are configured outside of reconciliation: the save/restore
+ * path is the WT_EVICT_UPDATE_RESTORE flag, the lookaside table path is
+ * the WT_EVICT_LOOKASIDE flag.
*/
- *updp = NULL;
+ if (!F_ISSET(r, WT_EVICT_LOOKASIDE | WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+ if (skipped && !F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ return (EBUSY);
+
+ append_origv = 0;
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE)) {
+ /*
+ * The save/restore eviction path.
+ *
+ * Clear the returned update so our caller ignores the key/value
+ * pair in the case of an insert/append list entry (everything
+ * we need is in the update list), and otherwise writes the
+ * original on-page key/value pair to which the update list
+ * applies.
+ */
+ *updp = NULL;
+
+ /* The page can't be marked clean. */
+ r->leave_dirty = 1;
+
+ /*
+ * A special-case for overflow values, where we can't write the
+ * original on-page value item to disk because it's been updated
+ * or removed.
+ *
+ * What happens is that an overflow value is updated or removed
+ * and its backing blocks freed. If any reader in the system
+ * might still want the value, a copy was cached in the page
+ * reconciliation tracking memory, and the page cell set to
+ * WT_CELL_VALUE_OVFL_RM. Eviction then chose the page and
+ * we're splitting it up in order to push parts of it out of
+ * memory.
+ *
+ * We could write the original on-page value item to disk... if
+ * we had a copy. The cache may not have a copy (a globally
+ * visible update would have kept a value from being cached), or
+ * an update that subsequently became globally visible could
+ * cause a cached value to be discarded. Either way, once there
+ * is a globally visible update, we may not have the original
+ * value.
+ *
+ * Fortunately, if there's a globally visible update we don't
+ * care about the original version, so we simply ignore it, no
+ * transaction can ever try and read it. If there isn't a
+ * globally visible update, there had better be a cached value.
+ *
+ * In the latter case, we could write the value out to disk, but
+ * (1) we are planning on re-instantiating this page in memory,
+ * it isn't going to disk, and (2) the value item is eventually
+ * going to be discarded, that seems like a waste of a write.
+ * Instead, find the cached value and append it to the update
+ * list we're saving for later restoration.
+ */
+ if (vpack != NULL &&
+ vpack->raw == WT_CELL_VALUE_OVFL_RM &&
+ !__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ } else {
+ /*
+ * The lookaside table eviction path.
+ *
+ * If at least one update is globally visible, copy the update
+ * list and ignore the current on-page value. If no update is
+ * globally visible, readers require the page's original value.
+ */
+ if (!__wt_txn_visible_all(session, min_txn))
+ append_origv = 1;
+ }
/*
- * Handle the case were we don't want to write an original on-page value
- * item to disk because it's been updated or removed.
- *
- * Here's the deal: an overflow value was updated or removed and its
- * backing blocks freed. If any transaction in the system might still
- * read the value, a copy was cached in page reconciliation tracking
- * memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
- * then chose the page and we're splitting it up in order to push parts
- * of it out of memory.
- *
- * We could write the original on-page value item to disk... if we had
- * a copy. The cache may not have a copy (a globally visible update
- * would have kept a value from ever being cached), or an update that
- * subsequent became globally visible could cause a cached value to be
- * discarded. Either way, once there's a globally visible update, we
- * may not have the value.
- *
- * Fortunately, if there's a globally visible update we don't care about
- * the original version, so we simply ignore it, no transaction can ever
- * try and read it. If there isn't a globally visible update, there had
- * better be a cached value.
- *
- * In the latter case, we could write the value out to disk, but (1) we
- * are planning on re-instantiating this page in memory, it isn't going
- * to disk, and (2) the value item is eventually going to be discarded,
- * that seems like a waste of a write. Instead, find the cached value
- * and append it to the update list we're saving for later restoration.
- */
- if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
- !__wt_txn_visible_all(session, min_txn)) {
- if ((ret = __wt_ovfl_txnc_search(
- page, vpack->data, vpack->size, &ovfl)) != 0)
- WT_PANIC_RET(session, ret,
- "cached overflow item discarded early");
+ * We need the original on-page value for some reason: get a copy and
+ * append it to the end of the update list with a transaction ID that
+ * guarantees its visibility.
+ */
+ if (append_origv) {
+ /*
+ * If we don't have a value cell, it's an insert/append list
+ * key/value pair which simply doesn't exist for some reader;
+ * place a deleted record at the end of the update list.
+ */
+ if (vpack == NULL || vpack->type == WT_CELL_DEL)
+ WT_RET(__wt_update_alloc(
+ session, NULL, &append, &notused));
+ else {
+ WT_RET(__wt_scr_alloc(session, 0, &tmp));
+ if ((ret = __wt_page_cell_data_ref(
+ session, page, vpack, tmp)) == 0)
+ ret = __wt_update_alloc(
+ session, tmp, &append, &notused);
+ __wt_scr_free(session, &tmp);
+ WT_RET(ret);
+ }
/*
- * Create an update structure with an impossibly low transaction
- * ID and append it to the update list we're about to save.
- * Restoring that update list when this page is re-instantiated
- * creates an update for the key/value pair visible to every
- * running transaction in the system, ensuring the on-page value
- * will be ignored.
+ * Give the entry an impossibly low transaction ID to ensure its
+ * global visibility, and append it to the update list.
+ *
+ * Note the change to the actual reader-accessible update list:
+ * from now on, the original on-page value appears at the end
+ * of the update list, even if this reconciliation subsequently
+ * fails.
*/
- WT_RET(__wt_update_alloc(session, &ovfl, &upd_ovfl, &notused));
- upd_ovfl->txnid = WT_TXN_NONE;
+ append->txnid = WT_TXN_NONE;
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
- upd->next = upd_ovfl;
+ upd->next = append;
}
- return (__rec_skip_update_save(session, r, ins, rip));
+ /*
+ * The order of the updates on the list matters: we can't move only the
+ * unresolved updates, we have to move the entire update list.
+ *
+ * If we skipped updates, the transaction value is never used. If we
+ * didn't skip updates, the list of updates is eventually written to
+ * the lookaside table, and associated with each update record is the
+ * transaction ID of the update we wrote in the reconciled page; once
+ * that transaction ID is globally visible, we know we no longer need
+ * the lookaside table records, allowing them to be discarded.
+ */
+ return (__rec_update_save(session,
+ r, ins, rip, (*updp == NULL) ? WT_TXN_NONE : (*updp)->txnid));
}
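The append-original-value step in isolation: a copy of the on-page value goes on the end of the update list with an impossibly old transaction ID, so every reader sees it instead of falling through to the about-to-be-rewritten on-page cell. Structure and names are illustrative, not WiredTiger's.

#include <stddef.h>
#include <stdint.h>

#define TXN_NONE 0			/* older than any running txn */

struct update {
	uint64_t txnid;			/* transaction that made the change */
	struct update *next;		/* newest-to-oldest list linkage */
	/* value bytes would follow in the real structure */
};

static void
append_orig_value(struct update *upd_list, struct update *append)
{
	struct update *upd;

	/* Impossibly old ID: globally visible to every transaction. */
	append->txnid = TXN_NONE;
	append->next = NULL;

	/* The original value is the oldest entry, append at the tail. */
	for (upd = upd_list; upd->next != NULL; upd = upd->next)
		;
	upd->next = append;
}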
/*
@@ -1104,8 +1388,8 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* to see if the delete is visible to us. Lock down the
* structure.
*/
- if (!WT_ATOMIC_CAS4(
- ref->state, WT_REF_DELETED, WT_REF_LOCKED))
+ if (!__wt_atomic_casv32(
+ &ref->state, WT_REF_DELETED, WT_REF_LOCKED))
break;
ret = __rec_child_deleted(session, r, ref, statep);
WT_PUBLISH(ref->state, WT_REF_DELETED);
@@ -1155,10 +1439,10 @@ __rec_child_modify(WT_SESSION_IMPL *session,
* If called during checkpoint, acquire a hazard pointer
* so the child isn't evicted, it's an in-memory case.
*
- * This call cannot return split/restart, dirty page
- * eviction is shutout during checkpoint, all splits in
- * process will have completed before we walk any pages
- * for checkpoint.
+ * This call cannot return split/restart, eviction of
+ * pages that split into their parent is shutout during
+ * checkpoint, all splits in process will have completed
+ * before we walk any pages for checkpoint.
*/
ret = __wt_page_in(session, ref,
WT_READ_CACHE | WT_READ_NO_EVICT |
@@ -1215,7 +1499,7 @@ in_memory:
* reason to write the cell.
*/
mod = ref->page->modify;
- if (mod != NULL && mod->flags != 0)
+ if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK))
*statep = WT_CHILD_MODIFIED;
else if (ref->addr == NULL) {
*statep = WT_CHILD_IGNORE;
@@ -1234,37 +1518,32 @@ static int
__rec_child_deleted(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *ref, int *statep)
{
- WT_BM *bm;
WT_PAGE_DELETED *page_del;
size_t addr_size;
const uint8_t *addr;
- bm = S2BT(session)->bm;
page_del = ref->page_del;
/*
* Internal pages with child leaf pages in the WT_REF_DELETED state are
* a special case during reconciliation. First, if the deletion was a
* result of a session truncate call, the deletion may not be visible to
- * us. In that case, we proceed as with any change that's not visible
- * during reconciliation by setting the skipped flag and ignoring the
- * change for the purposes of writing the internal page.
+ * us. In that case, we proceed as with any change not visible during
+ * reconciliation by ignoring the change for the purposes of writing the
+ * internal page.
*
* In this case, there must be an associated page-deleted structure, and
* it holds the transaction ID we care about.
+ *
+ * In some cases, there had better not be any updates we can't see.
*/
- if (page_del != NULL && !__wt_txn_visible(session, page_del->txnid)) {
- /*
- * In some cases, there had better not be any updates we can't
- * write.
- */
- if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
- WT_PANIC_RET(session, EINVAL,
- "reconciliation illegally skipped an update");
- }
+ if (F_ISSET(r, WT_VISIBILITY_ERR) &&
+ page_del != NULL && !__wt_txn_visible(session, page_del->txnid))
+ WT_PANIC_RET(session, EINVAL,
+ "reconciliation illegally skipped an update");
/*
- * The deletion is visible to us, deal with any underlying disk blocks.
+ * Deal with any underlying disk blocks.
*
* First, check to see if there is an address associated with this leaf:
* if there isn't, we're done, the underlying page is already gone. If
@@ -1291,7 +1570,7 @@ __rec_child_deleted(
(page_del == NULL ||
__wt_txn_visible_all(session, page_del->txnid))) {
WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(session, ((WT_ADDR *)ref->addr)->addr);
@@ -1562,7 +1841,7 @@ static void
__rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
{
bnd->offset = 0;
- bnd->recno = 0;
+ bnd->recno = WT_RECNO_OOB;
bnd->entries = 0;
__wt_free(session, bnd->addr.addr);
@@ -1571,9 +1850,9 @@ __rec_split_bnd_init(WT_SESSION_IMPL *session, WT_BOUNDARY *bnd)
bnd->cksum = 0;
__wt_free(session, bnd->dsk);
- __wt_free(session, bnd->skip);
- bnd->skip_next = 0;
- bnd->skip_allocated = 0;
+ __wt_free(session, bnd->supd);
+ bnd->supd_next = 0;
+ bnd->supd_allocated = 0;
/*
* Don't touch the key, we re-use that memory in each new
@@ -1775,9 +2054,13 @@ __rec_split_init(WT_SESSION_IMPL *session,
* __rec_is_checkpoint --
* Return if we're writing a checkpoint.
*/
-static int
-__rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
+static bool
+__rec_is_checkpoint(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_BOUNDARY *bnd)
{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
/*
* Check to see if we're going to create a checkpoint.
*
@@ -1792,13 +2075,14 @@ __rec_is_checkpoint(WT_RECONCILE *r, WT_BOUNDARY *bnd)
* we don't do checkpoint writes here; clear the boundary information as
* a reminder and create the checkpoint during wrapup.
*/
- if (bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
+ if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT) &&
+ bnd == &r->bnd[0] && __wt_ref_is_root(r->ref)) {
bnd->addr.addr = NULL;
bnd->addr.size = 0;
bnd->addr.type = 0;
- return (1);
+ return (true);
}
- return (0);
+ return (false);
}
/*
@@ -1841,7 +2125,7 @@ __rec_split_row_promote(
WT_DECL_ITEM(update);
WT_DECL_RET;
WT_ITEM *max;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t cnt, len, size;
uint32_t i;
const uint8_t *pa, *pb;
@@ -1892,36 +2176,37 @@ __rec_split_row_promote(
* the last key and smaller than the current key.
*/
max = r->last;
- for (i = r->skip_next; i > 0; --i) {
- skip = &r->skip[i - 1];
- if (skip->ins == NULL)
- WT_ERR(__wt_row_leaf_key(
- session, r->page, skip->rip, update, 0));
- else {
- update->data = WT_INSERT_KEY(skip->ins);
- update->size = WT_INSERT_KEY_SIZE(skip->ins);
- }
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE))
+ for (i = r->supd_next; i > 0; --i) {
+ supd = &r->supd[i - 1];
+ if (supd->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, r->page, supd->rip, update, 0));
+ else {
+ update->data = WT_INSERT_KEY(supd->ins);
+ update->size = WT_INSERT_KEY_SIZE(supd->ins);
+ }
- /* Compare against the current key, it must be less. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->cur, &cmp));
- if (cmp >= 0)
- continue;
+ /* Compare against the current key, it must be less. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->cur, &cmp));
+ if (cmp >= 0)
+ continue;
- /* Compare against the last key, it must be greater. */
- WT_ERR(__wt_compare(
- session, btree->collator, update, r->last, &cmp));
- if (cmp >= 0)
- max = update;
+ /* Compare against the last key, it must be greater. */
+ WT_ERR(__wt_compare(
+ session, btree->collator, update, r->last, &cmp));
+ if (cmp >= 0)
+ max = update;
- /*
- * The skipped updates are in key-sort order so the entry we're
- * looking for is either the last one or the next-to-last one
- * in the list. Once we've compared an entry against the last
- * key on the page, we're done.
- */
- break;
- }
+ /*
+ * The saved updates are in key-sort order so the entry
+ * we're looking for is either the last or the next-to-
+ * last one in the list. Once we've compared an entry
+ * against the last key on the page, we're done.
+ */
+ break;
+ }
/*
* The largest key on the last block must sort before the current key,
@@ -2228,7 +2513,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
* We track the record number at each column-store split point, set an
* initial value.
*/
- recno = 0;
+ recno = WT_RECNO_OOB;
if (dsk->type == WT_PAGE_COL_VAR)
recno = last->recno;
@@ -2326,10 +2611,8 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
WT_RET(compressor->pre_size(compressor, wt_session,
(uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
(size_t)r->raw_offsets[slots], &result_len));
- extra_skip = 0;
- if (btree->kencryptor != NULL)
- extra_skip = btree->kencryptor->size_const +
- WT_ENCRYPT_LEN_SIZE;
+ extra_skip = btree->kencryptor == NULL ? 0 :
+ btree->kencryptor->size_const + WT_ENCRYPT_LEN_SIZE;
corrected_page_size = result_len + WT_BLOCK_COMPRESS_SKIP;
WT_RET(bm->write_size(bm, session, &corrected_page_size));
@@ -2477,7 +2760,7 @@ no_slots:
break;
case WT_PAGE_ROW_INT:
case WT_PAGE_ROW_LEAF:
- next->recno = 0;
+ next->recno = WT_RECNO_OOB;
if (!last_block) {
/*
* Confirm there was uncompressed data remaining
@@ -2530,7 +2813,8 @@ no_slots:
*
* If it's not a checkpoint, write the block.
*/
- if (r->bnd_next == 1 && last_block && __rec_is_checkpoint(r, last)) {
+ if (r->bnd_next == 1 &&
+ last_block && __rec_is_checkpoint(session, r, last)) {
if (write_ref == dst)
WT_RET(__wt_buf_set(
session, &r->dsk, dst->mem, dst->size));
@@ -2647,13 +2931,29 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
- * We only arrive here with no entries to write if the page was entirely
- * empty, and if the page is empty, we merge it into its parent during
- * the parent's reconciliation. A page with skipped updates isn't truly
- * empty, continue on.
+ * We may arrive here with no entries to write if the page was entirely
+ * empty or if nothing on the page was visible to us.
*/
- if (r->entries == 0 && r->skip_next == 0)
- return (0);
+ if (r->entries == 0) {
+ /*
+ * Pages with skipped or not-yet-globally visible updates aren't
+ * really empty; otherwise, the page is truly empty and we will
+ * merge it into its parent during the parent's reconciliation.
+ */
+ if (r->supd_next == 0)
+ return (0);
+
+ /*
+ * If using the save/restore eviction path, continue with the
+ * write, the page will be restored after we finish.
+ *
+ * If using the lookaside table eviction path, we can't continue
+ * (we need a page to be written, otherwise we won't ever find
+ * the updates for future reads).
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE))
+ return (EBUSY);
+ }
/* Set the boundary reference and increment the count. */
bnd = &r->bnd[r->bnd_next++];
@@ -2666,9 +2966,8 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
dsk->mem_size = r->dsk.size = WT_PTRDIFF32(r->first_free, dsk);
/* If this is a checkpoint, we're done, otherwise write the page. */
- return (
- __rec_is_checkpoint(r, bnd) ? 0 :
- __rec_split_write(session, r, bnd, &r->dsk, 1));
+ return (__rec_is_checkpoint(session, r, bnd) ?
+ 0 : __rec_split_write(session, r, bnd, &r->dsk, 1));
}
/*
@@ -2794,7 +3093,7 @@ __rec_split_write(WT_SESSION_IMPL *session,
WT_PAGE *page;
WT_PAGE_HEADER *dsk;
WT_PAGE_MODIFY *mod;
- WT_UPD_SKIPPED *skip;
+ WT_SAVE_UPD *supd;
size_t addr_size;
uint32_t bnd_slot, i, j;
int cmp;
@@ -2837,23 +3136,23 @@ __rec_split_write(WT_SESSION_IMPL *session,
bnd->cksum = 0;
/*
- * Check if we've skipped updates that belong to this block, and move
- * any to the per-block structure. Quit as soon as we find a skipped
+ * Check if we've saved updates that belong to this block, and move
+ * any to the per-block structure. Quit as soon as we find a saved
* update that doesn't belong to the block, they're in sorted order.
*
* This code requires a key be filled in for the next block (or the
* last block flag be set, if there's no next block).
*/
- for (i = 0, skip = r->skip; i < r->skip_next; ++i, ++skip) {
- /* The last block gets all remaining skipped updates. */
+ for (i = 0, supd = r->supd; i < r->supd_next; ++i, ++supd) {
+ /* The last block gets all remaining saved updates. */
if (last_block) {
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
continue;
}
/*
- * Get the skipped update's key and compare it with this block's
- * key range. If the skipped update list belongs with the block
+ * Get the saved update's key and compare it with this block's
+ * key range. If the saved update list belongs with the block
* we're about to write, move it to the per-block memory. Check
* only to the first update that doesn't go with the block, they
* must be in sorted order.
@@ -2861,43 +3160,56 @@ __rec_split_write(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
- if (WT_INSERT_RECNO(skip->ins) >= (bnd + 1)->recno)
- goto skip_check_complete;
+ if (WT_INSERT_RECNO(supd->ins) >= (bnd + 1)->recno)
+ goto supd_check_complete;
break;
case WT_PAGE_ROW_LEAF:
- if (skip->ins == NULL)
+ if (supd->ins == NULL)
WT_ERR(__wt_row_leaf_key(
- session, page, skip->rip, key, 0));
+ session, page, supd->rip, key, 0));
else {
- key->data = WT_INSERT_KEY(skip->ins);
- key->size = WT_INSERT_KEY_SIZE(skip->ins);
+ key->data = WT_INSERT_KEY(supd->ins);
+ key->size = WT_INSERT_KEY_SIZE(supd->ins);
}
WT_ERR(__wt_compare(session,
btree->collator, key, &(bnd + 1)->key, &cmp));
if (cmp >= 0)
- goto skip_check_complete;
+ goto supd_check_complete;
break;
WT_ILLEGAL_VALUE_ERR(session);
}
- WT_ERR(__rec_skip_update_move(session, bnd, skip));
+ WT_ERR(__rec_update_move(session, bnd, supd));
}
-skip_check_complete:
+supd_check_complete:
/*
* If there are updates that weren't moved to the block, shuffle them to
- * the beginning of the cached list (we maintain the skipped updates in
- * sorted order, new skipped updates must be appended to the list).
+ * the beginning of the cached list (we maintain the saved updates in
+	 * sorted order; new saved updates must be appended to the list).
+ */
+ for (j = 0; i < r->supd_next; ++j, ++i)
+ r->supd[j] = r->supd[i];
+ r->supd_next = j;
+
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, note that in the
+ * page header.
*/
- for (j = 0; i < r->skip_next; ++j, ++i)
- r->skip[j] = r->skip[i];
- r->skip_next = j;
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL) {
+ F_SET(dsk, WT_PAGE_LAS_UPDATE);
+ r->cache_write_lookaside = 1;
+ }
/*
- * If we had to skip updates in order to build this disk image, we can't
- * actually write it. Instead, we will re-instantiate the page using the
- * disk image and the list of updates we skipped.
+ * If using the save/restore eviction path and we had to skip updates in
+ * order to build this disk image, we can't actually write it. Instead,
+ * we will re-instantiate the page using the disk image and the list of
+ * updates we skipped.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ r->cache_write_restore = 1;
+
/*
* If the buffer is compressed (raw compression was configured),
* we have to decompress it so we can instantiate it later. It's
@@ -2963,12 +3275,148 @@ skip_check_complete:
WT_ERR(__wt_strndup(session, addr, addr_size, &bnd->addr.addr));
bnd->addr.size = (uint8_t)addr_size;
+ /*
+ * If using the lookaside table eviction path and we found updates that
+ * weren't globally visible when reconciling this page, copy them into
+ * the database's lookaside store.
+ */
+ if (F_ISSET(r, WT_EVICT_LOOKASIDE) && bnd->supd != NULL)
+ ret = __rec_update_las(session, r, btree->id, bnd);
+
done:
err: __wt_scr_free(session, &key);
return (ret);
}
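
The two-line shuffle near supd_check_complete above is a plain in-place compaction: after the first i saved updates move to a block, the remainder slides to the front so the array stays sorted and new entries can simply be appended. A self-contained sketch with a stand-in struct (the real type is WT_SAVE_UPD):

#include <stdint.h>

struct save_upd { void *ins, *rip; uint64_t onpage_txn; };	/* stand-in */

static void
supd_compact(struct save_upd *supd, uint32_t *supd_nextp, uint32_t i)
{
	uint32_t j;

	/* Slide the unmoved, still-sorted entries to the front. */
	for (j = 0; i < *supd_nextp; ++j, ++i)
		supd[j] = supd[i];
	*supd_nextp = j;
}
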
/*
+ * __rec_update_las --
+ * Copy a set of updates into the database's lookaside buffer.
+ */
+static int
+__rec_update_las(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, uint32_t btree_id, WT_BOUNDARY *bnd)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(key);
+ WT_DECL_RET;
+ WT_ITEM las_addr, las_value;
+ WT_PAGE *page;
+ WT_SAVE_UPD *list;
+ WT_UPDATE *upd;
+ uint64_t las_counter;
+ uint32_t i, session_flags, slot;
+ uint8_t *p;
+
+ cursor = NULL;
+ WT_CLEAR(las_addr);
+ WT_CLEAR(las_value);
+ page = r->page;
+
+ /*
+ * We're writing lookaside records: start instantiating them on pages
+ * we read (with the right flag set), and start sweeping the file.
+ */
+ __wt_las_set_written(session);
+
+ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));
+
+ /* Ensure enough room for a column-store key without checking. */
+ WT_ERR(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key));
+
+ /*
+ * Each key in the lookaside table is associated with a block, and those
+ * blocks are freed and reallocated to other pages as pages in the tree
+ * are modified and reconciled. We want to be sure we don't add records
+ * to the lookaside table, then discard the block to which they apply,
+ * then write a new block to the same address, and then apply the old
+ * records to the new block when it's read. We don't want to clean old
+ * records out of the lookaside table every time we free a block because
+ * that happens a lot and would be costly; instead, we clean out the old
+ * records when adding new records into the lookaside table. This works
+ * because we only read from the lookaside table for pages marked with
+ * the WT_PAGE_LAS_UPDATE flag: that flag won't be set if we rewrite a
+ * block with no lookaside records, so the lookaside table won't be
+ * checked when the block is read, even if there are lookaside table
+ * records matching that block. If we rewrite a block that has lookaside
+ * records, we'll run this code, discarding any old records that might
+ * exist.
+ */
+ WT_ERR(__wt_las_remove_block(
+ session, cursor, btree_id, bnd->addr.addr, bnd->addr.size));
+
+ /* Lookaside table key component: block address. */
+ las_addr.data = bnd->addr.addr;
+ las_addr.size = bnd->addr.size;
+
+ /* Enter each update in the boundary's list into the lookaside store. */
+ for (las_counter = 0, i = 0,
+ list = bnd->supd; i < bnd->supd_next; ++i, ++list) {
+ /* Lookaside table key component: source key. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ p = key->mem;
+ WT_ERR(
+ __wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins)));
+ key->size = WT_PTRDIFF(p, key->data);
+
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL)
+ WT_ERR(__wt_row_leaf_key(
+ session, page, list->rip, key, 0));
+ else {
+ key->data = WT_INSERT_KEY(list->ins);
+ key->size = WT_INSERT_KEY_SIZE(list->ins);
+ }
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /* Lookaside table value component: update reference. */
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ upd = list->ins->upd;
+ break;
+ case WT_PAGE_ROW_LEAF:
+ if (list->ins == NULL) {
+ slot = WT_ROW_SLOT(page, list->rip);
+ upd = page->pg_row_upd[slot];
+ } else
+ upd = list->ins->upd;
+ break;
+ WT_ILLEGAL_VALUE_ERR(session);
+ }
+
+ /*
+ * Walk the list of updates, storing each key/value pair into
+ * the lookaside table.
+ */
+ do {
+ cursor->set_key(cursor, btree_id,
+ &las_addr, ++las_counter, list->onpage_txn, key);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ las_value.size = 0;
+ else {
+ las_value.data = WT_UPDATE_DATA(upd);
+ las_value.size = upd->size;
+ }
+ cursor->set_value(
+ cursor, upd->txnid, upd->size, &las_value);
+
+ WT_ERR(cursor->insert(cursor));
+ } while ((upd = upd->next) != NULL);
+ }
+
+err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
+
+ __wt_scr_free(session, &key);
+ return (ret);
+}
+
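
The set_key/set_value calls above imply the following lookaside record shape; the field names here are editor inventions, since the actual key/value formats belong to the lookaside cursor and aren't shown in this hunk. Because the block address sits at the front of the key (after the tree id), __wt_las_remove_block can drop every stale record for a reused address as one contiguous range before the new records are inserted.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch, not part of the patch. */
struct las_key {
	uint32_t btree_id;	/* which tree */
	const void *addr;	/* block address the page was written to */
	size_t addr_size;
	uint64_t counter;	/* per-block insertion order */
	uint64_t onpage_txn;	/* txn of the value left on the page */
	const void *key;	/* row key, or packed recno */
	size_t key_size;
};

struct las_value {
	uint64_t txnid;		/* update's transaction id */
	uint32_t size;		/* update size; 0 means deleted */
	const void *data;
};
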
+/*
* __wt_bulk_init --
* Bulk insert initialization.
*/
@@ -3008,7 +3456,7 @@ __wt_bulk_init(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
recno = 1;
break;
case BTREE_ROW:
- recno = 0;
+ recno = WT_RECNO_OOB;
break;
WT_ILLEGAL_VALUE(session);
}
@@ -3049,6 +3497,7 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk)
WT_RET(__rec_split_finish(session, r));
WT_RET(__rec_write_wrapup(session, r, r->page));
+ WT_RET(__rec_write_status(session, r, r->page));
/* Mark the page's parent and the tree dirty. */
parent = r->ref->home;
@@ -3824,7 +4273,7 @@ record_loop: /*
* Write a placeholder.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
data = "@";
size = 1;
@@ -4207,7 +4656,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
vtype = state == WT_CHILD_PROXY ?
WT_CELL_ADDR_DEL : (u_int)vpack->raw;
}
- __rec_cell_build_addr(r, p, size, vtype, 0);
+ __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB);
CHILD_RELEASE_ERR(session, hazard, ref);
/*
@@ -4294,7 +4743,7 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
addr = &multi->addr;
__rec_cell_build_addr(
- r, addr->addr, addr->size, __rec_vtype(addr), 0);
+ r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB);
/* Boundary: split or write the page. */
if (key->len + val->len > r->space_avail)
@@ -4450,7 +4899,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
* Assert the case.
*/
WT_ASSERT(session,
- F_ISSET(r, WT_SKIP_UPDATE_RESTORE));
+ F_ISSET(r, WT_EVICT_UPDATE_RESTORE));
/*
* If the key is also a removed overflow item,
@@ -4777,13 +5226,11 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
static int
__rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_BM *bm;
WT_DECL_RET;
WT_PAGE_MODIFY *mod;
WT_MULTI *multi;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -4799,17 +5246,17 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- if (multi->skip == NULL) {
+ if (multi->supd == NULL) {
if (multi->addr.reuse)
multi->addr.addr = NULL;
else {
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
multi->addr.addr, multi->addr.size));
__wt_free(session, multi->addr.addr);
}
} else {
- __wt_free(session, multi->skip);
- __wt_free(session, multi->skip_dsk);
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->supd_dsk);
}
}
__wt_free(session, mod->mod_multi);
@@ -4882,7 +5329,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
WT_RET(__wt_ref_info(
session, ref, &addr, &addr_size, NULL));
- WT_RET(bm->free(bm, session, addr, addr_size));
+ WT_RET(__rec_block_free(session, addr, addr_size));
if (__wt_off_page(ref->home, ref->addr)) {
__wt_free(
session, ((WT_ADDR *)ref->addr)->addr);
@@ -4908,7 +5355,7 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* are checkpoints, and must be explicitly dropped.
*/
if (!__wt_ref_is_root(ref))
- WT_RET(bm->free(bm, session,
+ WT_RET(__rec_block_free(session,
mod->mod_replace.addr, mod->mod_replace.size));
/* Discard the replacement page's address. */
@@ -4962,14 +5409,14 @@ __rec_write_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* nothing to write. Allocate, then initialize the array of
* replacement blocks.
*/
- if (bnd->skip != NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
WT_RET(__wt_calloc_def(
session, r->bnd_next, &mod->mod_multi));
multi = mod->mod_multi;
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
bnd->dsk = NULL;
mod->mod_multi_entries = 1;
@@ -5068,50 +5515,6 @@ err: __wt_scr_free(session, &tkey);
F_SET(mod, WT_PM_REC_MULTIBLOCK);
break;
}
-
- /*
- * If updates were skipped, the tree isn't clean. The checkpoint call
- * cleared the tree's modified value before calling the eviction thread,
- * so we must explicitly reset the tree's modified flag. We insert a
- * barrier after the change for clarity (the requirement is the value
- * be set before a subsequent checkpoint reads it, and because the
- * current checkpoint is waiting on this reconciliation to complete,
- * there's no risk of that happening).
- */
- if (r->leave_dirty) {
- mod->first_dirty_txn = r->first_dirty_txn;
-
- btree->modified = 1;
- WT_FULL_BARRIER();
- } else {
- /*
- * If no updates were skipped, we have a new maximum transaction
- * written for the page (used to decide if a clean page can be
- * evicted). Set the highest transaction ID for the page.
- *
- * Track the highest transaction ID for the tree (used to decide
- * if it's safe to discard all of the pages in the tree without
- * further checking). Reconciliation in the service of eviction
- * is multi-threaded, only update the tree's maximum transaction
- * ID when doing a checkpoint. That's sufficient, we only care
- * about the highest transaction ID of any update currently in
- * the tree, and checkpoint visits every dirty page in the tree.
- */
- mod->rec_max_txn = r->max_txn;
- if (!F_ISSET(r, WT_EVICTING) &&
- WT_TXNID_LT(btree->rec_max_txn, r->max_txn))
- btree->rec_max_txn = r->max_txn;
-
- /*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean. If the
- * write generation changed, the page has been written since
- * we started reconciliation and remains dirty.
- */
- if (WT_ATOMIC_CAS4(mod->write_gen, r->orig_write_gen, 0))
- __wt_cache_dirty_decr(session, page);
- }
-
return (0);
}
@@ -5122,14 +5525,12 @@ err: __wt_scr_free(session, &tkey);
static int
__rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
{
- WT_BM *bm;
WT_BOUNDARY *bnd;
WT_DECL_RET;
WT_MULTI *multi;
WT_PAGE_MODIFY *mod;
uint32_t i;
- bm = S2BT(session)->bm;
mod = page->modify;
/*
@@ -5160,7 +5561,7 @@ __rec_write_wrapup_err(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (bnd->addr.reuse)
bnd->addr.addr = NULL;
else {
- WT_TRET(bm->free(bm, session,
+ WT_TRET(__rec_block_free(session,
bnd->addr.addr, bnd->addr.size));
__wt_free(session, bnd->addr.addr);
}
@@ -5203,18 +5604,18 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_RET(__wt_row_ikey_alloc(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
@@ -5243,18 +5644,18 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
multi->key.recno = bnd->recno;
- if (bnd->skip == NULL) {
+ if (F_ISSET(r, WT_EVICT_UPDATE_RESTORE) && bnd->supd != NULL) {
+ multi->supd = bnd->supd;
+ multi->supd_entries = bnd->supd_next;
+ bnd->supd = NULL;
+ multi->supd_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ } else {
multi->addr = bnd->addr;
multi->addr.reuse = 0;
multi->size = bnd->size;
multi->cksum = bnd->cksum;
bnd->addr.addr = NULL;
- } else {
- multi->skip = bnd->skip;
- multi->skip_entries = bnd->skip_next;
- bnd->skip = NULL;
- multi->skip_dsk = bnd->dsk;
- bnd->dsk = NULL;
}
}
mod->mod_multi_entries = r->bnd_next;
diff --git a/src/schema/schema_list.c b/src/schema/schema_list.c
index a36fd696079..d091a5d94da 100644
--- a/src/schema/schema_list.c
+++ b/src/schema/schema_list.c
@@ -29,8 +29,8 @@ __schema_add_table(WT_SESSION_IMPL *session,
WT_RET(ret);
bucket = table->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_INSERT_HEAD(&session->tables, table, l);
- SLIST_INSERT_HEAD(&session->tablehash[bucket], table, hashl);
+ TAILQ_INSERT_HEAD(&session->tables, table, q);
+ TAILQ_INSERT_HEAD(&session->tablehash[bucket], table, hashq);
*tablep = table;
return (0);
@@ -51,7 +51,7 @@ __schema_find_table(WT_SESSION_IMPL *session,
bucket = __wt_hash_city64(name, namelen) % WT_HASH_ARRAY_SIZE;
restart:
- SLIST_FOREACH(table, &session->tablehash[bucket], hashl) {
+ TAILQ_FOREACH(table, &session->tablehash[bucket], hashq) {
tablename = table->name;
(void)WT_PREFIX_SKIP(tablename, "table:");
if (WT_STRING_MATCH(tablename, name, namelen)) {
@@ -228,8 +228,8 @@ __wt_schema_remove_table(WT_SESSION_IMPL *session, WT_TABLE *table)
WT_ASSERT(session, table->refcnt <= 1);
bucket = table->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_REMOVE(&session->tables, table, __wt_table, l);
- SLIST_REMOVE(&session->tablehash[bucket], table, __wt_table, hashl);
+ TAILQ_REMOVE(&session->tables, table, q);
+ TAILQ_REMOVE(&session->tablehash[bucket], table, hashq);
return (__wt_schema_destroy_table(session, &table));
}
@@ -243,7 +243,7 @@ __wt_schema_close_tables(WT_SESSION_IMPL *session)
WT_DECL_RET;
WT_TABLE *table;
- while ((table = SLIST_FIRST(&session->tables)) != NULL)
+ while ((table = TAILQ_FIRST(&session->tables)) != NULL)
WT_TRET(__wt_schema_remove_table(session, table));
return (ret);
}
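
This and the session changes below swap SLIST for TAILQ throughout. TAILQ_REMOVE unlinks in O(1) given the element, while SLIST_REMOVE has to walk the list to find the predecessor, which is presumably the motivation for caches that delete entries mid-list. A self-contained <sys/queue.h> example of the macros the patch now uses:

#include <stdio.h>
#include <sys/queue.h>

struct table {
	const char *name;
	TAILQ_ENTRY(table) q;		/* linkage, as in the patch */
};
TAILQ_HEAD(table_list, table);

int
main(void)
{
	struct table_list head = TAILQ_HEAD_INITIALIZER(head);
	struct table a = { "table:a", { NULL, NULL } };
	struct table *t;

	TAILQ_INSERT_HEAD(&head, &a, q);
	TAILQ_FOREACH(t, &head, q)
		printf("%s\n", t->name);
	TAILQ_REMOVE(&head, &a, q);	/* O(1): no list walk needed */
	return (0);
}
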
diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c
index dea797f823d..e9439abe16f 100644
--- a/src/schema/schema_stat.c
+++ b/src/schema/schema_stat.c
@@ -90,7 +90,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
if (i == 0)
*stats = *new;
else
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
@@ -102,7 +102,7 @@ __wt_curstat_table_init(WT_SESSION_IMPL *session,
WT_ERR(__wt_curstat_open(
session, buf->data, cfg, &stat_cursor));
new = (WT_DSRC_STATS *)WT_CURSOR_STATS(stat_cursor);
- __wt_stat_aggregate_dsrc_stats(new, stats);
+ __wt_stat_dsrc_aggregate_single(new, stats);
WT_ERR(stat_cursor->close(stat_cursor));
}
diff --git a/src/session/session_api.c b/src/session/session_api.c
index ef9735a8b98..a1f5618a317 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -383,6 +383,22 @@ err: if (cursor != NULL)
}
/*
+ * __wt_session_create --
+ * Internal version of WT_SESSION::create.
+ */
+int
+__wt_session_create(
+ WT_SESSION_IMPL *session, const char *uri, const char *config)
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_create(session, uri, config)));
+ return (ret);
+}
+
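
__wt_session_create factors the locked body out of the public method so internal callers, already inside an API call, can create objects with the canonical lock order: schema lock first, then table lock (__wt_session_drop below follows the same pattern). A minimal sketch of the macro-nesting idea, with hypothetical lock_acquire/lock_release/schema_create helpers; the real WT_WITH_* macros are more careful (for example, about locks already held):

typedef int lock_t;					/* hypothetical */
static void lock_acquire(lock_t *l) { (void)l; }	/* hypothetical */
static void lock_release(lock_t *l) { (void)l; }	/* hypothetical */

struct session { lock_t schema_lock, table_lock; };

static int
schema_create(struct session *s, const char *uri, const char *cfg)
{
	(void)s; (void)uri; (void)cfg;
	return (0);					/* hypothetical */
}

/* Run an expression while holding a lock, like WT_WITH_SCHEMA_LOCK. */
#define WITH_LOCK(lock, op) do {				\
	lock_acquire(lock);					\
	op;							\
	lock_release(lock);					\
} while (0)

static int
session_create_locked(struct session *s, const char *uri, const char *cfg)
{
	int ret;

	WITH_LOCK(&s->schema_lock,
	    WITH_LOCK(&s->table_lock,
		ret = schema_create(s, uri, cfg)));
	return (ret);
}
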
+/*
* __session_create --
* WT_SESSION->create method.
*/
@@ -423,9 +439,7 @@ __session_create(WT_SESSION *wt_session, const char *uri, const char *config)
WT_ERR_NOTFOUND_OK(ret);
}
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_create(session, uri, config)));
+ ret = __wt_session_create(session, uri, config);
err: API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -529,6 +543,21 @@ __session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
}
/*
+ * __wt_session_drop --
+ * Internal version of WT_SESSION::drop.
+ */
+int
+__wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
+{
+ WT_DECL_RET;
+
+ WT_WITH_SCHEMA_LOCK(session,
+ WT_WITH_TABLE_LOCK(session,
+ ret = __wt_schema_drop(session, uri, cfg)));
+ return (ret);
+}
+
+/*
* __session_drop --
* WT_SESSION->drop method.
*/
@@ -544,9 +573,7 @@ __session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
/* Disallow objects in the WiredTiger name space. */
WT_ERR(__wt_str_name_check(session, uri));
- WT_WITH_SCHEMA_LOCK(session,
- WT_WITH_TABLE_LOCK(session,
- ret = __wt_schema_drop(session, uri, cfg)));
+ ret = __wt_session_drop(session, uri, cfg);
err: /* Note: drop operations cannot be unrolled (yet?). */
API_END_RET_NOTFOUND_MAP(session, ret);
@@ -800,7 +827,7 @@ __session_commit_transaction(WT_SESSION *wt_session, const char *config)
WT_STAT_FAST_CONN_INCR(session, txn_commit);
txn = &session->txn;
- if (F_ISSET(txn, WT_TXN_ERROR)) {
+ if (F_ISSET(txn, WT_TXN_ERROR) && txn->mod_count != 0) {
__wt_errx(session, "failed transaction requires rollback");
ret = EINVAL;
}
@@ -915,7 +942,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* If our LSN is smaller than the current sync LSN then our
* transaction is stable. We're done.
*/
- if (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
+ if (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) <= 0)
goto err;
/*
@@ -937,7 +964,7 @@ __session_transaction_sync(WT_SESSION *wt_session, const char *config)
* Keep checking the LSNs until we find it is stable or we reach
* our timeout.
*/
- while (WT_LOG_CMP(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
+ while (__wt_log_cmp(&session->bg_sync_lsn, &log->sync_lsn) > 0) {
WT_ERR(__wt_cond_signal(session, conn->log_file_cond));
WT_ERR(__wt_epoch(session, &now));
waited_ms = WT_TIMEDIFF(now, start) / WT_MILLION;
@@ -1001,7 +1028,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
* operations, but checkpoint does enough I/O it may be called upon to
* perform slow operations for the block manager.
*/
- F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+ F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
/*
* Only one checkpoint can be active at a time, and checkpoints must run
@@ -1016,7 +1043,7 @@ __session_checkpoint(WT_SESSION *wt_session, const char *config)
WT_STAT_FAST_CONN_SET(session, txn_checkpoint_running, 0);
-err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_CACHE_CHECK);
+err: F_CLR(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION);
API_END_RET_NOTFOUND_MAP(session, ret);
}
@@ -1166,8 +1193,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
if (i == conn->session_size)
WT_ERR_MSG(session, ENOMEM,
"only configured to support %" PRIu32 " sessions"
- " (including %" PRIu32 " internal)",
- conn->session_size, WT_NUM_INTERNAL_SESSIONS);
+ " (including %d additional internal sessions)",
+ conn->session_size, WT_EXTRA_INTERNAL_SESSIONS);
/*
* If the active session count is increasing, update it. We don't worry
@@ -1190,7 +1217,7 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
event_handler == NULL ? session->event_handler : event_handler);
TAILQ_INIT(&session_ret->cursors);
- SLIST_INIT(&session_ret->dhandles);
+ TAILQ_INIT(&session_ret->dhandles);
/*
* If we don't have one, allocate the dhandle hash array.
* Allocate the table hash array as well.
@@ -1202,8 +1229,8 @@ __wt_open_session(WT_CONNECTION_IMPL *conn,
WT_ERR(__wt_calloc(session_ret, WT_HASH_ARRAY_SIZE,
sizeof(struct __tables_hash), &session_ret->tablehash));
for (i = 0; i < WT_HASH_ARRAY_SIZE; i++) {
- SLIST_INIT(&session_ret->dhhash[i]);
- SLIST_INIT(&session_ret->tablehash[i]);
+ TAILQ_INIT(&session_ret->dhhash[i]);
+ TAILQ_INIT(&session_ret->tablehash[i]);
}
/* Initialize transaction support: default to read-committed. */
diff --git a/src/session/session_dhandle.c b/src/session/session_dhandle.c
index be8ca494778..dd0b50cc094 100644
--- a/src/session/session_dhandle.c
+++ b/src/session/session_dhandle.c
@@ -25,8 +25,8 @@ __session_add_dhandle(
dhandle_cache->dhandle = session->dhandle;
bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_INSERT_HEAD(&session->dhandles, dhandle_cache, l);
- SLIST_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashl);
+ TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q);
+ TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq);
if (dhandle_cachep != NULL)
*dhandle_cachep = dhandle_cache;
@@ -36,6 +36,61 @@ __session_add_dhandle(
}
/*
+ * __session_discard_dhandle --
+ * Remove a data handle from the session cache.
+ */
+static void
+__session_discard_dhandle(
+ WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
+{
+ uint64_t bucket;
+
+ bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
+ TAILQ_REMOVE(&session->dhandles, dhandle_cache, q);
+ TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq);
+
+ (void)__wt_atomic_sub32(&dhandle_cache->dhandle->session_ref, 1);
+
+ __wt_overwrite_and_free(session, dhandle_cache);
+}
+
+/*
+ * __session_find_dhandle --
+ * Search for a data handle in the session cache.
+ */
+static void
+__session_find_dhandle(WT_SESSION_IMPL *session,
+ const char *uri, const char *checkpoint,
+ WT_DATA_HANDLE_CACHE **dhandle_cachep)
+{
+ WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
+ uint64_t bucket;
+
+ dhandle = NULL;
+
+ bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
+retry: TAILQ_FOREACH(dhandle_cache, &session->dhhash[bucket], hashq) {
+ dhandle = dhandle_cache->dhandle;
+ if (WT_DHANDLE_INACTIVE(dhandle) && !WT_IS_METADATA(dhandle)) {
+ __session_discard_dhandle(session, dhandle_cache);
+		/* We deleted our entry; retry from the start. */
+ goto retry;
+ }
+
+ if (strcmp(uri, dhandle->name) != 0)
+ continue;
+ if (checkpoint == NULL && dhandle->checkpoint == NULL)
+ break;
+ if (checkpoint != NULL && dhandle->checkpoint != NULL &&
+ strcmp(checkpoint, dhandle->checkpoint) == 0)
+ break;
+ }
+
+ *dhandle_cachep = dhandle_cache;
+}
+
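
A minimal sketch of the delete-and-retry idiom __session_find_dhandle uses (stand-in types; the real code also refuses to discard the metadata handle). Removing the current element invalidates a plain TAILQ_FOREACH iterator, so the walk restarts from the bucket head; TAILQ_FOREACH_SAFE would avoid the rescan, but hash buckets are presumably short enough that restarting is simpler.

#include <sys/queue.h>

struct entry {
	int dead;			/* stand-in for WT_DHANDLE_INACTIVE */
	TAILQ_ENTRY(entry) hashq;
};
TAILQ_HEAD(bucket, entry);

static void
bucket_prune(struct bucket *b)
{
	struct entry *e;

retry:	TAILQ_FOREACH(e, b, hashq)
		if (e->dead) {
			TAILQ_REMOVE(b, e, hashq);
			/* The iterator is now invalid: restart. */
			goto retry;
		}
}
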
+/*
* __wt_session_lock_dhandle --
* Return when the current data handle is either (a) open with the
* requested lock mode; or (b) closed and write locked. If exclusive
@@ -173,6 +228,7 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
{
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
+ WT_DATA_HANDLE_CACHE *dhandle_cache;
WT_DECL_RET;
int locked, write_locked;
@@ -185,6 +241,13 @@ __wt_session_release_btree(WT_SESSION_IMPL *session)
* If we had special flags set, close the handle so that future access
* can get a handle without special flags.
*/
+ if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_FORCE)) {
+ __session_find_dhandle(session,
+ dhandle->name, dhandle->checkpoint, &dhandle_cache);
+ if (dhandle_cache != NULL)
+ __session_discard_dhandle(session, dhandle_cache);
+ }
+
if (F_ISSET(dhandle, WT_DHANDLE_DISCARD_FORCE)) {
ret = __wt_conn_btree_sync_and_close(session, 0, 1);
F_CLR(dhandle, WT_DHANDLE_DISCARD_FORCE);
@@ -272,26 +335,6 @@ retry: WT_RET(__wt_meta_checkpoint_last_name(
}
/*
- * __session_discard_btree --
- * Discard our reference to the btree.
- */
-static void
-__session_discard_btree(
- WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
-{
- uint64_t bucket;
-
- bucket = dhandle_cache->dhandle->name_hash % WT_HASH_ARRAY_SIZE;
- SLIST_REMOVE(
- &session->dhandles, dhandle_cache, __wt_data_handle_cache, l);
- SLIST_REMOVE(&session->dhhash[bucket],
- dhandle_cache, __wt_data_handle_cache, hashl);
-
- (void)WT_ATOMIC_SUB4(dhandle_cache->dhandle->session_ref, 1);
- __wt_overwrite_and_free(session, dhandle_cache);
-}
-
-/*
* __wt_session_close_cache --
* Close any cached handles in a session.
*/
@@ -300,8 +343,8 @@ __wt_session_close_cache(WT_SESSION_IMPL *session)
{
WT_DATA_HANDLE_CACHE *dhandle_cache;
- while ((dhandle_cache = SLIST_FIRST(&session->dhandles)) != NULL)
- __session_discard_btree(session, dhandle_cache);
+ while ((dhandle_cache = TAILQ_FIRST(&session->dhandles)) != NULL)
+ __session_discard_dhandle(session, dhandle_cache);
}
/*
@@ -329,18 +372,18 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
WT_STAT_FAST_CONN_INCR(session, dh_session_sweeps);
- dhandle_cache = SLIST_FIRST(&session->dhandles);
+ dhandle_cache = TAILQ_FIRST(&session->dhandles);
while (dhandle_cache != NULL) {
- dhandle_cache_next = SLIST_NEXT(dhandle_cache, l);
+ dhandle_cache_next = TAILQ_NEXT(dhandle_cache, q);
dhandle = dhandle_cache->dhandle;
if (dhandle != session->dhandle &&
dhandle->session_inuse == 0 &&
- (F_ISSET(dhandle, WT_DHANDLE_DEAD) ||
+ (WT_DHANDLE_INACTIVE(dhandle) ||
(dhandle->timeofdeath != 0 &&
now - dhandle->timeofdeath > conn->sweep_idle_time))) {
WT_STAT_FAST_CONN_INCR(session, dh_session_handles);
WT_ASSERT(session, !WT_IS_METADATA(dhandle));
- __session_discard_btree(session, dhandle_cache);
+ __session_discard_dhandle(session, dhandle_cache);
}
dhandle_cache = dhandle_cache_next;
}
@@ -348,51 +391,37 @@ __session_dhandle_sweep(WT_SESSION_IMPL *session)
}
/*
- * __session_dhandle_find_shared --
+ * __session_find_shared_dhandle --
* Search for a data handle in the connection and add it to a session's
* cache. Since the data handle isn't locked, this must be called holding
* the handle list lock, and we must increment the handle's reference
* count before releasing it.
*/
static int
-__session_dhandle_find_shared(
+__session_find_shared_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
WT_RET(__wt_conn_dhandle_find(session, uri, checkpoint));
- (void)WT_ATOMIC_ADD4(session->dhandle->session_ref, 1);
+ (void)__wt_atomic_add32(&session->dhandle->session_ref, 1);
return (0);
}
+
/*
- * __session_dhandle_find --
+ * __session_get_dhandle --
* Search for a data handle, first in the session cache, then in the
* connection.
*/
static int
-__session_dhandle_find(
+__session_get_dhandle(
WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
- WT_DATA_HANDLE *dhandle;
WT_DATA_HANDLE_CACHE *dhandle_cache;
WT_DECL_RET;
- uint64_t bucket;
- bucket = __wt_hash_city64(uri, strlen(uri)) % WT_HASH_ARRAY_SIZE;
-retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
- dhandle = dhandle_cache->dhandle;
- if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
- WT_ASSERT(session, !WT_IS_METADATA(dhandle));
- __session_discard_btree(session, dhandle_cache);
- /* We deleted our entry, retry from the start. */
- goto retry;
- }
- if (strcmp(uri, dhandle->name) != 0)
- continue;
- if ((checkpoint == NULL && dhandle->checkpoint == NULL) ||
- (checkpoint != NULL && dhandle->checkpoint != NULL &&
- strcmp(checkpoint, dhandle->checkpoint) == 0)) {
- session->dhandle = dhandle;
- return (0);
- }
+ __session_find_dhandle(session, uri, checkpoint, &dhandle_cache);
+ if (dhandle_cache != NULL) {
+ session->dhandle = dhandle_cache->dhandle;
+ return (0);
}
/*
@@ -400,7 +429,7 @@ retry: SLIST_FOREACH(dhandle_cache, &session->dhhash[bucket], hashl) {
* handle list and cache the handle we find.
*/
WT_WITH_HANDLE_LIST_LOCK(session, ret =
- __session_dhandle_find_shared(session, uri, checkpoint));
+ __session_find_shared_dhandle(session, uri, checkpoint));
if (ret == 0)
ret = __session_add_dhandle(session, NULL);
@@ -422,7 +451,7 @@ __wt_session_get_btree(WT_SESSION_IMPL *session,
WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));
for (;;) {
- WT_RET(__session_dhandle_find(session, uri, checkpoint));
+ WT_RET(__session_get_dhandle(session, uri, checkpoint));
dhandle = session->dhandle;
/*
diff --git a/src/support/pow.c b/src/support/pow.c
index 8e42113a2ee..0f50bfe56a1 100644
--- a/src/support/pow.c
+++ b/src/support/pow.c
@@ -100,7 +100,7 @@ __wt_log2_int(uint32_t n)
* __wt_ispo2 --
* Return if a number is a power-of-two.
*/
-int
+bool
__wt_ispo2(uint32_t v)
{
/*
diff --git a/src/support/rand.c b/src/support/rand.c
index caac04d3529..f5ecb12633e 100644
--- a/src/support/rand.c
+++ b/src/support/rand.c
@@ -84,8 +84,11 @@ __wt_random(WT_RAND_STATE volatile * rnd_state)
* to initialize the state, or initializes with a seed that results in a
* short period.
*/
- if (z == 0 || w == 0)
- __wt_random_init(rnd_state);
+ if (z == 0 || w == 0) {
+ __wt_random_init(&rnd);
+ w = M_W(rnd);
+ z = M_Z(rnd);
+ }
M_Z(rnd) = z = 36969 * (z & 65535) + (z >> 16);
M_W(rnd) = w = 18000 * (w & 65535) + (w >> 16);
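
The two recurrences are Marsaglia's multiply-with-carry pair. Zero is a fixed point of each step (36969 * 0 + 0 == 0), so a zeroed word never escapes and the period collapses; that is why the state is re-seeded, and the fix also refreshes the local copies w and z, which the old code left stale after re-initializing the shared state. A self-contained sketch; the (z << 16) + w combining step is the textbook form, the patch's actual return expression is outside this hunk:

#include <stdint.h>

static uint32_t m_z = 362436069, m_w = 521288629;	/* any nonzero seeds */

static uint32_t
mwc_random(void)
{
	/* A zero word would stay zero forever: re-seed both. */
	if (m_z == 0 || m_w == 0) {
		m_z = 362436069;
		m_w = 521288629;
	}
	m_z = 36969 * (m_z & 65535) + (m_z >> 16);
	m_w = 18000 * (m_w & 65535) + (m_w >> 16);
	return ((m_z << 16) + m_w);
}
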
diff --git a/src/support/stat.c b/src/support/stat.c
index b0e7d660587..79248b0652c 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -2,672 +2,1016 @@
#include "wt_internal.h"
+static const char * const __stats_dsrc_desc[] = {
+ "block-manager: file allocation unit size",
+ "block-manager: blocks allocated",
+ "block-manager: checkpoint size",
+ "block-manager: allocations requiring file extension",
+ "block-manager: blocks freed",
+ "block-manager: file magic number",
+ "block-manager: file major version number",
+ "block-manager: minor version number",
+ "block-manager: file bytes available for reuse",
+ "block-manager: file size in bytes",
+ "LSM: bloom filters in the LSM tree",
+ "LSM: bloom filter false positives",
+ "LSM: bloom filter hits",
+ "LSM: bloom filter misses",
+ "LSM: bloom filter pages evicted from cache",
+ "LSM: bloom filter pages read into cache",
+ "LSM: total size of bloom filters",
+ "btree: btree checkpoint generation",
+ "btree: column-store variable-size deleted values",
+ "btree: column-store fixed-size leaf pages",
+ "btree: column-store internal pages",
+ "btree: column-store variable-size RLE encoded values",
+ "btree: column-store variable-size leaf pages",
+ "btree: pages rewritten by compaction",
+ "btree: number of key/value pairs",
+ "btree: fixed-record size",
+ "btree: maximum tree depth",
+ "btree: maximum internal page key size",
+ "btree: maximum internal page size",
+ "btree: maximum leaf page key size",
+ "btree: maximum leaf page size",
+ "btree: maximum leaf page value size",
+ "btree: overflow pages",
+ "btree: row-store internal pages",
+ "btree: row-store leaf pages",
+ "cache: bytes read into cache",
+ "cache: bytes written from cache",
+ "cache: checkpoint blocked page eviction",
+ "cache: unmodified pages evicted",
+ "cache: page split during eviction deepened the tree",
+ "cache: modified pages evicted",
+ "cache: data source pages selected for eviction unable to be evicted",
+ "cache: hazard pointer blocked page eviction",
+ "cache: internal pages evicted",
+ "cache: pages split during eviction",
+ "cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
+ "cache: overflow values cached in memory",
+ "cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
+ "cache: overflow pages read into cache",
+ "cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
+ "compression: raw compression call failed, no additional data available",
+ "compression: raw compression call failed, additional data available",
+ "compression: raw compression call succeeded",
+ "compression: compressed pages read",
+ "compression: compressed pages written",
+ "compression: page written failed to compress",
+ "compression: page written was too small to compress",
+ "cursor: create calls",
+ "cursor: insert calls",
+ "cursor: bulk-loaded cursor-insert calls",
+ "cursor: cursor-insert key and value bytes inserted",
+ "cursor: next calls",
+ "cursor: prev calls",
+ "cursor: remove calls",
+ "cursor: cursor-remove key bytes removed",
+ "cursor: reset calls",
+ "cursor: restarted searches",
+ "cursor: search calls",
+ "cursor: search near calls",
+ "cursor: update calls",
+ "cursor: cursor-update value bytes updated",
+ "LSM: sleep for LSM checkpoint throttle",
+ "LSM: chunks in the LSM tree",
+ "LSM: highest merge generation in the LSM tree",
+ "LSM: queries that could have benefited from a Bloom filter that did not exist",
+ "LSM: sleep for LSM merge throttle",
+ "reconciliation: dictionary matches",
+ "reconciliation: internal page multi-block writes",
+ "reconciliation: leaf page multi-block writes",
+ "reconciliation: maximum blocks required for a page",
+ "reconciliation: internal-page overflow keys",
+ "reconciliation: leaf-page overflow keys",
+ "reconciliation: overflow values written",
+ "reconciliation: pages deleted",
+ "reconciliation: page checksum matches",
+ "reconciliation: page reconciliation calls",
+ "reconciliation: page reconciliation calls for eviction",
+ "reconciliation: leaf page key bytes discarded using prefix compression",
+ "reconciliation: internal page key bytes discarded using suffix compression",
+ "session: object compaction",
+ "session: open cursor count",
+ "transaction: update conflicts",
+};
+
+const char *
+__wt_stat_dsrc_desc(int slot)
+{
+ return (__stats_dsrc_desc[slot]);
+}
+
void
-__wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
+__wt_stat_dsrc_init_single(WT_DSRC_STATS *stats)
{
- /* Clear, so can also be called for reinitialization. */
memset(stats, 0, sizeof(*stats));
+}
+
+void
+__wt_stat_dsrc_init(WT_DATA_HANDLE *handle)
+{
+ int i;
- stats->block_extension.desc =
- "block-manager: allocations requiring file extension";
- stats->block_alloc.desc = "block-manager: blocks allocated";
- stats->block_free.desc = "block-manager: blocks freed";
- stats->block_checkpoint_size.desc = "block-manager: checkpoint size";
- stats->allocation_size.desc =
- "block-manager: file allocation unit size";
- stats->block_reuse_bytes.desc =
- "block-manager: file bytes available for reuse";
- stats->block_magic.desc = "block-manager: file magic number";
- stats->block_major.desc = "block-manager: file major version number";
- stats->block_size.desc = "block-manager: file size in bytes";
- stats->block_minor.desc = "block-manager: minor version number";
- stats->btree_checkpoint_generation.desc =
- "btree: btree checkpoint generation";
- stats->btree_column_fix.desc =
- "btree: column-store fixed-size leaf pages";
- stats->btree_column_internal.desc =
- "btree: column-store internal pages";
- stats->btree_column_deleted.desc =
- "btree: column-store variable-size deleted values";
- stats->btree_column_variable.desc =
- "btree: column-store variable-size leaf pages";
- stats->btree_fixed_len.desc = "btree: fixed-record size";
- stats->btree_maxintlkey.desc = "btree: maximum internal page key size";
- stats->btree_maxintlpage.desc = "btree: maximum internal page size";
- stats->btree_maxleafkey.desc = "btree: maximum leaf page key size";
- stats->btree_maxleafpage.desc = "btree: maximum leaf page size";
- stats->btree_maxleafvalue.desc = "btree: maximum leaf page value size";
- stats->btree_maximum_depth.desc = "btree: maximum tree depth";
- stats->btree_entries.desc = "btree: number of key/value pairs";
- stats->btree_overflow.desc = "btree: overflow pages";
- stats->btree_compact_rewrite.desc =
- "btree: pages rewritten by compaction";
- stats->btree_row_internal.desc = "btree: row-store internal pages";
- stats->btree_row_leaf.desc = "btree: row-store leaf pages";
- stats->cache_bytes_read.desc = "cache: bytes read into cache";
- stats->cache_bytes_write.desc = "cache: bytes written from cache";
- stats->cache_eviction_checkpoint.desc =
- "cache: checkpoint blocked page eviction";
- stats->cache_eviction_fail.desc =
- "cache: data source pages selected for eviction unable to be evicted";
- stats->cache_eviction_hazard.desc =
- "cache: hazard pointer blocked page eviction";
- stats->cache_inmem_split.desc = "cache: in-memory page splits";
- stats->cache_eviction_internal.desc = "cache: internal pages evicted";
- stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
- stats->cache_read_overflow.desc =
- "cache: overflow pages read into cache";
- stats->cache_overflow_value.desc =
- "cache: overflow values cached in memory";
- stats->cache_eviction_deepen.desc =
- "cache: page split during eviction deepened the tree";
- stats->cache_read.desc = "cache: pages read into cache";
- stats->cache_eviction_split.desc =
- "cache: pages split during eviction";
- stats->cache_write.desc = "cache: pages written from cache";
- stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
- stats->compress_read.desc = "compression: compressed pages read";
- stats->compress_write.desc = "compression: compressed pages written";
- stats->compress_write_fail.desc =
- "compression: page written failed to compress";
- stats->compress_write_too_small.desc =
- "compression: page written was too small to compress";
- stats->compress_raw_fail_temporary.desc =
- "compression: raw compression call failed, additional data available";
- stats->compress_raw_fail.desc =
- "compression: raw compression call failed, no additional data available";
- stats->compress_raw_ok.desc =
- "compression: raw compression call succeeded";
- stats->cursor_insert_bulk.desc =
- "cursor: bulk-loaded cursor-insert calls";
- stats->cursor_create.desc = "cursor: create calls";
- stats->cursor_insert_bytes.desc =
- "cursor: cursor-insert key and value bytes inserted";
- stats->cursor_remove_bytes.desc =
- "cursor: cursor-remove key bytes removed";
- stats->cursor_update_bytes.desc =
- "cursor: cursor-update value bytes updated";
- stats->cursor_insert.desc = "cursor: insert calls";
- stats->cursor_next.desc = "cursor: next calls";
- stats->cursor_prev.desc = "cursor: prev calls";
- stats->cursor_remove.desc = "cursor: remove calls";
- stats->cursor_reset.desc = "cursor: reset calls";
- stats->cursor_search.desc = "cursor: search calls";
- stats->cursor_search_near.desc = "cursor: search near calls";
- stats->cursor_update.desc = "cursor: update calls";
- stats->bloom_false_positive.desc = "LSM: bloom filter false positives";
- stats->bloom_hit.desc = "LSM: bloom filter hits";
- stats->bloom_miss.desc = "LSM: bloom filter misses";
- stats->bloom_page_evict.desc =
- "LSM: bloom filter pages evicted from cache";
- stats->bloom_page_read.desc =
- "LSM: bloom filter pages read into cache";
- stats->bloom_count.desc = "LSM: bloom filters in the LSM tree";
- stats->lsm_chunk_count.desc = "LSM: chunks in the LSM tree";
- stats->lsm_generation_max.desc =
- "LSM: highest merge generation in the LSM tree";
- stats->lsm_lookup_no_bloom.desc =
- "LSM: queries that could have benefited from a Bloom filter that did not exist";
- stats->lsm_checkpoint_throttle.desc =
- "LSM: sleep for LSM checkpoint throttle";
- stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
- stats->bloom_size.desc = "LSM: total size of bloom filters";
- stats->rec_dictionary.desc = "reconciliation: dictionary matches";
- stats->rec_suffix_compression.desc =
- "reconciliation: internal page key bytes discarded using suffix compression";
- stats->rec_multiblock_internal.desc =
- "reconciliation: internal page multi-block writes";
- stats->rec_overflow_key_internal.desc =
- "reconciliation: internal-page overflow keys";
- stats->rec_prefix_compression.desc =
- "reconciliation: leaf page key bytes discarded using prefix compression";
- stats->rec_multiblock_leaf.desc =
- "reconciliation: leaf page multi-block writes";
- stats->rec_overflow_key_leaf.desc =
- "reconciliation: leaf-page overflow keys";
- stats->rec_multiblock_max.desc =
- "reconciliation: maximum blocks required for a page";
- stats->rec_overflow_value.desc =
- "reconciliation: overflow values written";
- stats->rec_page_match.desc = "reconciliation: page checksum matches";
- stats->rec_pages.desc = "reconciliation: page reconciliation calls";
- stats->rec_pages_eviction.desc =
- "reconciliation: page reconciliation calls for eviction";
- stats->rec_page_delete.desc = "reconciliation: pages deleted";
- stats->session_compact.desc = "session: object compaction";
- stats->session_cursor_open.desc = "session: open cursor count";
- stats->txn_update_conflict.desc = "transaction: update conflicts";
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+ handle->stats[i] = &handle->stat_array[i];
+ __wt_stat_dsrc_init_single(handle->stats[i]);
+ }
}
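
The rewritten statistics support drops the per-field {value, description} pairs: descriptions move into the static, slot-indexed __stats_dsrc_desc array above, and each handle carries WT_COUNTER_SLOTS plain-integer structures, with handle->stats[i] pointing at slot i. Presumably each updating thread works on its own slot so concurrent increments rarely share a cache line, and readers aggregate across slots. A minimal sketch of that layout (stand-in names; WT_STAT_READ in the later hunks plays the stat_read role):

#include <stdint.h>

#define COUNTER_SLOTS	8		/* stand-in for WT_COUNTER_SLOTS */

struct dsrc_stats { int64_t cache_read; /* ...one field per statistic */ };

struct handle {
	struct dsrc_stats stat_array[COUNTER_SLOTS];
	struct dsrc_stats *stats[COUNTER_SLOTS];
};

static void
handle_stats_init(struct handle *h)
{
	int i;

	for (i = 0; i < COUNTER_SLOTS; ++i)
		h->stats[i] = &h->stat_array[i];
}

/* Readers sum a field across all slots. */
static int64_t
stat_read_cache_read(struct handle *h)
{
	int64_t sum;
	int i;

	for (sum = 0, i = 0; i < COUNTER_SLOTS; ++i)
		sum += h->stats[i]->cache_read;
	return (sum);
}
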
void
-__wt_stat_refresh_dsrc_stats(void *stats_arg)
+__wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
{
- WT_DSRC_STATS *stats;
+ stats->block_extension = 0;
+ stats->block_alloc = 0;
+ stats->block_free = 0;
+ stats->block_checkpoint_size = 0;
+ stats->allocation_size = 0;
+ stats->block_reuse_bytes = 0;
+ stats->block_magic = 0;
+ stats->block_major = 0;
+ stats->block_size = 0;
+ stats->block_minor = 0;
+ /* not clearing btree_checkpoint_generation */
+ stats->btree_column_fix = 0;
+ stats->btree_column_internal = 0;
+ stats->btree_column_deleted = 0;
+ stats->btree_column_variable = 0;
+ stats->btree_column_rle = 0;
+ stats->btree_fixed_len = 0;
+ stats->btree_maxintlkey = 0;
+ stats->btree_maxintlpage = 0;
+ stats->btree_maxleafkey = 0;
+ stats->btree_maxleafpage = 0;
+ stats->btree_maxleafvalue = 0;
+ stats->btree_maximum_depth = 0;
+ stats->btree_entries = 0;
+ stats->btree_overflow = 0;
+ stats->btree_compact_rewrite = 0;
+ stats->btree_row_internal = 0;
+ stats->btree_row_leaf = 0;
+ stats->cache_bytes_read = 0;
+ stats->cache_bytes_write = 0;
+ stats->cache_eviction_checkpoint = 0;
+ stats->cache_eviction_fail = 0;
+ stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
+ stats->cache_inmem_split = 0;
+ stats->cache_eviction_internal = 0;
+ stats->cache_eviction_dirty = 0;
+ stats->cache_read_overflow = 0;
+ stats->cache_overflow_value = 0;
+ stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
+ stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
+ stats->cache_eviction_split = 0;
+ stats->cache_write = 0;
+ stats->cache_write_restore = 0;
+ stats->cache_eviction_clean = 0;
+ stats->compress_read = 0;
+ stats->compress_write = 0;
+ stats->compress_write_fail = 0;
+ stats->compress_write_too_small = 0;
+ stats->compress_raw_fail_temporary = 0;
+ stats->compress_raw_fail = 0;
+ stats->compress_raw_ok = 0;
+ stats->cursor_insert_bulk = 0;
+ stats->cursor_create = 0;
+ stats->cursor_insert_bytes = 0;
+ stats->cursor_remove_bytes = 0;
+ stats->cursor_update_bytes = 0;
+ stats->cursor_insert = 0;
+ stats->cursor_next = 0;
+ stats->cursor_prev = 0;
+ stats->cursor_remove = 0;
+ stats->cursor_reset = 0;
+ stats->cursor_restart = 0;
+ stats->cursor_search = 0;
+ stats->cursor_search_near = 0;
+ stats->cursor_update = 0;
+ stats->bloom_false_positive = 0;
+ stats->bloom_hit = 0;
+ stats->bloom_miss = 0;
+ stats->bloom_page_evict = 0;
+ stats->bloom_page_read = 0;
+ stats->bloom_count = 0;
+ stats->lsm_chunk_count = 0;
+ stats->lsm_generation_max = 0;
+ stats->lsm_lookup_no_bloom = 0;
+ stats->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = 0;
+ stats->bloom_size = 0;
+ stats->rec_dictionary = 0;
+ stats->rec_suffix_compression = 0;
+ stats->rec_multiblock_internal = 0;
+ stats->rec_overflow_key_internal = 0;
+ stats->rec_prefix_compression = 0;
+ stats->rec_multiblock_leaf = 0;
+ stats->rec_overflow_key_leaf = 0;
+ stats->rec_multiblock_max = 0;
+ stats->rec_overflow_value = 0;
+ stats->rec_page_match = 0;
+ stats->rec_pages = 0;
+ stats->rec_pages_eviction = 0;
+ stats->rec_page_delete = 0;
+ stats->session_compact = 0;
+ /* not clearing session_cursor_open */
+ stats->txn_update_conflict = 0;
+}
+
+void
+__wt_stat_dsrc_clear_all(WT_DSRC_STATS **stats)
+{
+ u_int i;
- stats = (WT_DSRC_STATS *)stats_arg;
- stats->block_extension.v = 0;
- stats->block_alloc.v = 0;
- stats->block_free.v = 0;
- stats->block_checkpoint_size.v = 0;
- stats->allocation_size.v = 0;
- stats->block_reuse_bytes.v = 0;
- stats->block_magic.v = 0;
- stats->block_major.v = 0;
- stats->block_size.v = 0;
- stats->block_minor.v = 0;
- stats->btree_column_fix.v = 0;
- stats->btree_column_internal.v = 0;
- stats->btree_column_deleted.v = 0;
- stats->btree_column_variable.v = 0;
- stats->btree_fixed_len.v = 0;
- stats->btree_maxintlkey.v = 0;
- stats->btree_maxintlpage.v = 0;
- stats->btree_maxleafkey.v = 0;
- stats->btree_maxleafpage.v = 0;
- stats->btree_maxleafvalue.v = 0;
- stats->btree_maximum_depth.v = 0;
- stats->btree_entries.v = 0;
- stats->btree_overflow.v = 0;
- stats->btree_compact_rewrite.v = 0;
- stats->btree_row_internal.v = 0;
- stats->btree_row_leaf.v = 0;
- stats->cache_bytes_read.v = 0;
- stats->cache_bytes_write.v = 0;
- stats->cache_eviction_checkpoint.v = 0;
- stats->cache_eviction_fail.v = 0;
- stats->cache_eviction_hazard.v = 0;
- stats->cache_inmem_split.v = 0;
- stats->cache_eviction_internal.v = 0;
- stats->cache_eviction_dirty.v = 0;
- stats->cache_read_overflow.v = 0;
- stats->cache_overflow_value.v = 0;
- stats->cache_eviction_deepen.v = 0;
- stats->cache_read.v = 0;
- stats->cache_eviction_split.v = 0;
- stats->cache_write.v = 0;
- stats->cache_eviction_clean.v = 0;
- stats->compress_read.v = 0;
- stats->compress_write.v = 0;
- stats->compress_write_fail.v = 0;
- stats->compress_write_too_small.v = 0;
- stats->compress_raw_fail_temporary.v = 0;
- stats->compress_raw_fail.v = 0;
- stats->compress_raw_ok.v = 0;
- stats->cursor_insert_bulk.v = 0;
- stats->cursor_create.v = 0;
- stats->cursor_insert_bytes.v = 0;
- stats->cursor_remove_bytes.v = 0;
- stats->cursor_update_bytes.v = 0;
- stats->cursor_insert.v = 0;
- stats->cursor_next.v = 0;
- stats->cursor_prev.v = 0;
- stats->cursor_remove.v = 0;
- stats->cursor_reset.v = 0;
- stats->cursor_search.v = 0;
- stats->cursor_search_near.v = 0;
- stats->cursor_update.v = 0;
- stats->bloom_false_positive.v = 0;
- stats->bloom_hit.v = 0;
- stats->bloom_miss.v = 0;
- stats->bloom_page_evict.v = 0;
- stats->bloom_page_read.v = 0;
- stats->bloom_count.v = 0;
- stats->lsm_chunk_count.v = 0;
- stats->lsm_generation_max.v = 0;
- stats->lsm_lookup_no_bloom.v = 0;
- stats->lsm_checkpoint_throttle.v = 0;
- stats->lsm_merge_throttle.v = 0;
- stats->bloom_size.v = 0;
- stats->rec_dictionary.v = 0;
- stats->rec_suffix_compression.v = 0;
- stats->rec_multiblock_internal.v = 0;
- stats->rec_overflow_key_internal.v = 0;
- stats->rec_prefix_compression.v = 0;
- stats->rec_multiblock_leaf.v = 0;
- stats->rec_overflow_key_leaf.v = 0;
- stats->rec_multiblock_max.v = 0;
- stats->rec_overflow_value.v = 0;
- stats->rec_page_match.v = 0;
- stats->rec_pages.v = 0;
- stats->rec_pages_eviction.v = 0;
- stats->rec_page_delete.v = 0;
- stats->session_compact.v = 0;
- stats->txn_update_conflict.v = 0;
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i)
+ __wt_stat_dsrc_clear_single(stats[i]);
+}
+
+void
+__wt_stat_dsrc_aggregate_single(
+ WT_DSRC_STATS *from, WT_DSRC_STATS *to)
+{
+ to->block_extension += from->block_extension;
+ to->block_alloc += from->block_alloc;
+ to->block_free += from->block_free;
+ to->block_checkpoint_size += from->block_checkpoint_size;
+ to->allocation_size = from->allocation_size;
+ to->block_reuse_bytes += from->block_reuse_bytes;
+ to->block_magic = from->block_magic;
+ to->block_major = from->block_major;
+ to->block_size += from->block_size;
+ to->block_minor = from->block_minor;
+ to->btree_checkpoint_generation += from->btree_checkpoint_generation;
+ to->btree_column_fix += from->btree_column_fix;
+ to->btree_column_internal += from->btree_column_internal;
+ to->btree_column_deleted += from->btree_column_deleted;
+ to->btree_column_variable += from->btree_column_variable;
+ to->btree_column_rle += from->btree_column_rle;
+ to->btree_fixed_len = from->btree_fixed_len;
+ if (from->btree_maxintlkey > to->btree_maxintlkey)
+ to->btree_maxintlkey = from->btree_maxintlkey;
+ if (from->btree_maxintlpage > to->btree_maxintlpage)
+ to->btree_maxintlpage = from->btree_maxintlpage;
+ if (from->btree_maxleafkey > to->btree_maxleafkey)
+ to->btree_maxleafkey = from->btree_maxleafkey;
+ if (from->btree_maxleafpage > to->btree_maxleafpage)
+ to->btree_maxleafpage = from->btree_maxleafpage;
+ if (from->btree_maxleafvalue > to->btree_maxleafvalue)
+ to->btree_maxleafvalue = from->btree_maxleafvalue;
+ if (from->btree_maximum_depth > to->btree_maximum_depth)
+ to->btree_maximum_depth = from->btree_maximum_depth;
+ to->btree_entries += from->btree_entries;
+ to->btree_overflow += from->btree_overflow;
+ to->btree_compact_rewrite += from->btree_compact_rewrite;
+ to->btree_row_internal += from->btree_row_internal;
+ to->btree_row_leaf += from->btree_row_leaf;
+ to->cache_bytes_read += from->cache_bytes_read;
+ to->cache_bytes_write += from->cache_bytes_write;
+ to->cache_eviction_checkpoint += from->cache_eviction_checkpoint;
+ to->cache_eviction_fail += from->cache_eviction_fail;
+ to->cache_eviction_hazard += from->cache_eviction_hazard;
+ to->cache_inmem_splittable += from->cache_inmem_splittable;
+ to->cache_inmem_split += from->cache_inmem_split;
+ to->cache_eviction_internal += from->cache_eviction_internal;
+ to->cache_eviction_dirty += from->cache_eviction_dirty;
+ to->cache_read_overflow += from->cache_read_overflow;
+ to->cache_overflow_value += from->cache_overflow_value;
+ to->cache_eviction_deepen += from->cache_eviction_deepen;
+ to->cache_write_lookaside += from->cache_write_lookaside;
+ to->cache_read += from->cache_read;
+ to->cache_read_lookaside += from->cache_read_lookaside;
+ to->cache_eviction_split += from->cache_eviction_split;
+ to->cache_write += from->cache_write;
+ to->cache_write_restore += from->cache_write_restore;
+ to->cache_eviction_clean += from->cache_eviction_clean;
+ to->compress_read += from->compress_read;
+ to->compress_write += from->compress_write;
+ to->compress_write_fail += from->compress_write_fail;
+ to->compress_write_too_small += from->compress_write_too_small;
+ to->compress_raw_fail_temporary += from->compress_raw_fail_temporary;
+ to->compress_raw_fail += from->compress_raw_fail;
+ to->compress_raw_ok += from->compress_raw_ok;
+ to->cursor_insert_bulk += from->cursor_insert_bulk;
+ to->cursor_create += from->cursor_create;
+ to->cursor_insert_bytes += from->cursor_insert_bytes;
+ to->cursor_remove_bytes += from->cursor_remove_bytes;
+ to->cursor_update_bytes += from->cursor_update_bytes;
+ to->cursor_insert += from->cursor_insert;
+ to->cursor_next += from->cursor_next;
+ to->cursor_prev += from->cursor_prev;
+ to->cursor_remove += from->cursor_remove;
+ to->cursor_reset += from->cursor_reset;
+ to->cursor_restart += from->cursor_restart;
+ to->cursor_search += from->cursor_search;
+ to->cursor_search_near += from->cursor_search_near;
+ to->cursor_update += from->cursor_update;
+ to->bloom_false_positive += from->bloom_false_positive;
+ to->bloom_hit += from->bloom_hit;
+ to->bloom_miss += from->bloom_miss;
+ to->bloom_page_evict += from->bloom_page_evict;
+ to->bloom_page_read += from->bloom_page_read;
+ to->bloom_count += from->bloom_count;
+ to->lsm_chunk_count += from->lsm_chunk_count;
+ if (from->lsm_generation_max > to->lsm_generation_max)
+ to->lsm_generation_max = from->lsm_generation_max;
+ to->lsm_lookup_no_bloom += from->lsm_lookup_no_bloom;
+ to->lsm_checkpoint_throttle += from->lsm_checkpoint_throttle;
+ to->lsm_merge_throttle += from->lsm_merge_throttle;
+ to->bloom_size += from->bloom_size;
+ to->rec_dictionary += from->rec_dictionary;
+ to->rec_suffix_compression += from->rec_suffix_compression;
+ to->rec_multiblock_internal += from->rec_multiblock_internal;
+ to->rec_overflow_key_internal += from->rec_overflow_key_internal;
+ to->rec_prefix_compression += from->rec_prefix_compression;
+ to->rec_multiblock_leaf += from->rec_multiblock_leaf;
+ to->rec_overflow_key_leaf += from->rec_overflow_key_leaf;
+ if (from->rec_multiblock_max > to->rec_multiblock_max)
+ to->rec_multiblock_max = from->rec_multiblock_max;
+ to->rec_overflow_value += from->rec_overflow_value;
+ to->rec_page_match += from->rec_page_match;
+ to->rec_pages += from->rec_pages;
+ to->rec_pages_eviction += from->rec_pages_eviction;
+ to->rec_page_delete += from->rec_page_delete;
+ to->session_compact += from->session_compact;
+ to->session_cursor_open += from->session_cursor_open;
+ to->txn_update_conflict += from->txn_update_conflict;
}
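
Aggregation above applies one of three rules per statistic: counters sum, high-water marks take the maximum, and per-file constants (allocation unit size, magic and version numbers, fixed-record size) are copied through unchanged. A compressed sketch of the same with a stand-in struct:

#include <stdint.h>

struct dsrc_stats {
	int64_t block_alloc;		/* counter */
	int64_t btree_maximum_depth;	/* high-water mark */
	int64_t allocation_size;	/* per-file constant */
};

static void
aggregate_one(const struct dsrc_stats *from, struct dsrc_stats *to)
{
	to->block_alloc += from->block_alloc;
	if (from->btree_maximum_depth > to->btree_maximum_depth)
		to->btree_maximum_depth = from->btree_maximum_depth;
	to->allocation_size = from->allocation_size;
}
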
void
-__wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
+__wt_stat_dsrc_aggregate(
+ WT_DSRC_STATS **from, WT_DSRC_STATS *to)
{
- WT_DSRC_STATS *c, *p;
+ int64_t v;
- c = (WT_DSRC_STATS *)child;
- p = (WT_DSRC_STATS *)parent;
- p->block_extension.v += c->block_extension.v;
- p->block_alloc.v += c->block_alloc.v;
- p->block_free.v += c->block_free.v;
- p->block_checkpoint_size.v += c->block_checkpoint_size.v;
- p->block_reuse_bytes.v += c->block_reuse_bytes.v;
- p->block_size.v += c->block_size.v;
- p->btree_checkpoint_generation.v += c->btree_checkpoint_generation.v;
- p->btree_column_fix.v += c->btree_column_fix.v;
- p->btree_column_internal.v += c->btree_column_internal.v;
- p->btree_column_deleted.v += c->btree_column_deleted.v;
- p->btree_column_variable.v += c->btree_column_variable.v;
- if (c->btree_maxintlkey.v > p->btree_maxintlkey.v)
- p->btree_maxintlkey.v = c->btree_maxintlkey.v;
- if (c->btree_maxintlpage.v > p->btree_maxintlpage.v)
- p->btree_maxintlpage.v = c->btree_maxintlpage.v;
- if (c->btree_maxleafkey.v > p->btree_maxleafkey.v)
- p->btree_maxleafkey.v = c->btree_maxleafkey.v;
- if (c->btree_maxleafpage.v > p->btree_maxleafpage.v)
- p->btree_maxleafpage.v = c->btree_maxleafpage.v;
- if (c->btree_maxleafvalue.v > p->btree_maxleafvalue.v)
- p->btree_maxleafvalue.v = c->btree_maxleafvalue.v;
- if (c->btree_maximum_depth.v > p->btree_maximum_depth.v)
- p->btree_maximum_depth.v = c->btree_maximum_depth.v;
- p->btree_entries.v += c->btree_entries.v;
- p->btree_overflow.v += c->btree_overflow.v;
- p->btree_compact_rewrite.v += c->btree_compact_rewrite.v;
- p->btree_row_internal.v += c->btree_row_internal.v;
- p->btree_row_leaf.v += c->btree_row_leaf.v;
- p->cache_bytes_read.v += c->cache_bytes_read.v;
- p->cache_bytes_write.v += c->cache_bytes_write.v;
- p->cache_eviction_checkpoint.v += c->cache_eviction_checkpoint.v;
- p->cache_eviction_fail.v += c->cache_eviction_fail.v;
- p->cache_eviction_hazard.v += c->cache_eviction_hazard.v;
- p->cache_inmem_split.v += c->cache_inmem_split.v;
- p->cache_eviction_internal.v += c->cache_eviction_internal.v;
- p->cache_eviction_dirty.v += c->cache_eviction_dirty.v;
- p->cache_read_overflow.v += c->cache_read_overflow.v;
- p->cache_overflow_value.v += c->cache_overflow_value.v;
- p->cache_eviction_deepen.v += c->cache_eviction_deepen.v;
- p->cache_read.v += c->cache_read.v;
- p->cache_eviction_split.v += c->cache_eviction_split.v;
- p->cache_write.v += c->cache_write.v;
- p->cache_eviction_clean.v += c->cache_eviction_clean.v;
- p->compress_read.v += c->compress_read.v;
- p->compress_write.v += c->compress_write.v;
- p->compress_write_fail.v += c->compress_write_fail.v;
- p->compress_write_too_small.v += c->compress_write_too_small.v;
- p->compress_raw_fail_temporary.v += c->compress_raw_fail_temporary.v;
- p->compress_raw_fail.v += c->compress_raw_fail.v;
- p->compress_raw_ok.v += c->compress_raw_ok.v;
- p->cursor_insert_bulk.v += c->cursor_insert_bulk.v;
- p->cursor_create.v += c->cursor_create.v;
- p->cursor_insert_bytes.v += c->cursor_insert_bytes.v;
- p->cursor_remove_bytes.v += c->cursor_remove_bytes.v;
- p->cursor_update_bytes.v += c->cursor_update_bytes.v;
- p->cursor_insert.v += c->cursor_insert.v;
- p->cursor_next.v += c->cursor_next.v;
- p->cursor_prev.v += c->cursor_prev.v;
- p->cursor_remove.v += c->cursor_remove.v;
- p->cursor_reset.v += c->cursor_reset.v;
- p->cursor_search.v += c->cursor_search.v;
- p->cursor_search_near.v += c->cursor_search_near.v;
- p->cursor_update.v += c->cursor_update.v;
- p->bloom_false_positive.v += c->bloom_false_positive.v;
- p->bloom_hit.v += c->bloom_hit.v;
- p->bloom_miss.v += c->bloom_miss.v;
- p->bloom_page_evict.v += c->bloom_page_evict.v;
- p->bloom_page_read.v += c->bloom_page_read.v;
- p->bloom_count.v += c->bloom_count.v;
- p->lsm_chunk_count.v += c->lsm_chunk_count.v;
- if (c->lsm_generation_max.v > p->lsm_generation_max.v)
- p->lsm_generation_max.v = c->lsm_generation_max.v;
- p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
- p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
- p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
- p->bloom_size.v += c->bloom_size.v;
- p->rec_dictionary.v += c->rec_dictionary.v;
- p->rec_suffix_compression.v += c->rec_suffix_compression.v;
- p->rec_multiblock_internal.v += c->rec_multiblock_internal.v;
- p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
- p->rec_prefix_compression.v += c->rec_prefix_compression.v;
- p->rec_multiblock_leaf.v += c->rec_multiblock_leaf.v;
- p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
- if (c->rec_multiblock_max.v > p->rec_multiblock_max.v)
- p->rec_multiblock_max.v = c->rec_multiblock_max.v;
- p->rec_overflow_value.v += c->rec_overflow_value.v;
- p->rec_page_match.v += c->rec_page_match.v;
- p->rec_pages.v += c->rec_pages.v;
- p->rec_pages_eviction.v += c->rec_pages_eviction.v;
- p->rec_page_delete.v += c->rec_page_delete.v;
- p->session_compact.v += c->session_compact.v;
- p->session_cursor_open.v += c->session_cursor_open.v;
- p->txn_update_conflict.v += c->txn_update_conflict.v;
+ to->block_extension += WT_STAT_READ(from, block_extension);
+ to->block_alloc += WT_STAT_READ(from, block_alloc);
+ to->block_free += WT_STAT_READ(from, block_free);
+ to->block_checkpoint_size +=
+ WT_STAT_READ(from, block_checkpoint_size);
+ to->allocation_size = from[0]->allocation_size;
+ to->block_reuse_bytes += WT_STAT_READ(from, block_reuse_bytes);
+ to->block_magic = from[0]->block_magic;
+ to->block_major = from[0]->block_major;
+ to->block_size += WT_STAT_READ(from, block_size);
+ to->block_minor = from[0]->block_minor;
+ to->btree_checkpoint_generation +=
+ WT_STAT_READ(from, btree_checkpoint_generation);
+ to->btree_column_fix += WT_STAT_READ(from, btree_column_fix);
+ to->btree_column_internal +=
+ WT_STAT_READ(from, btree_column_internal);
+ to->btree_column_deleted += WT_STAT_READ(from, btree_column_deleted);
+ to->btree_column_variable +=
+ WT_STAT_READ(from, btree_column_variable);
+ to->btree_column_rle += WT_STAT_READ(from, btree_column_rle);
+ to->btree_fixed_len = from[0]->btree_fixed_len;
+ if ((v = WT_STAT_READ(from, btree_maxintlkey)) >
+ to->btree_maxintlkey)
+ to->btree_maxintlkey = v;
+ if ((v = WT_STAT_READ(from, btree_maxintlpage)) >
+ to->btree_maxintlpage)
+ to->btree_maxintlpage = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafkey)) >
+ to->btree_maxleafkey)
+ to->btree_maxleafkey = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafpage)) >
+ to->btree_maxleafpage)
+ to->btree_maxleafpage = v;
+ if ((v = WT_STAT_READ(from, btree_maxleafvalue)) >
+ to->btree_maxleafvalue)
+ to->btree_maxleafvalue = v;
+ if ((v = WT_STAT_READ(from, btree_maximum_depth)) >
+ to->btree_maximum_depth)
+ to->btree_maximum_depth = v;
+ to->btree_entries += WT_STAT_READ(from, btree_entries);
+ to->btree_overflow += WT_STAT_READ(from, btree_overflow);
+ to->btree_compact_rewrite +=
+ WT_STAT_READ(from, btree_compact_rewrite);
+ to->btree_row_internal += WT_STAT_READ(from, btree_row_internal);
+ to->btree_row_leaf += WT_STAT_READ(from, btree_row_leaf);
+ to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read);
+ to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
+ to->cache_eviction_checkpoint +=
+ WT_STAT_READ(from, cache_eviction_checkpoint);
+ to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
+ to->cache_eviction_hazard +=
+ WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
+ to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
+ to->cache_eviction_internal +=
+ WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
+ to->cache_read_overflow += WT_STAT_READ(from, cache_read_overflow);
+ to->cache_overflow_value += WT_STAT_READ(from, cache_overflow_value);
+ to->cache_eviction_deepen +=
+ WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
+ to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
+ to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
+ to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
+ to->compress_read += WT_STAT_READ(from, compress_read);
+ to->compress_write += WT_STAT_READ(from, compress_write);
+ to->compress_write_fail += WT_STAT_READ(from, compress_write_fail);
+ to->compress_write_too_small +=
+ WT_STAT_READ(from, compress_write_too_small);
+ to->compress_raw_fail_temporary +=
+ WT_STAT_READ(from, compress_raw_fail_temporary);
+ to->compress_raw_fail += WT_STAT_READ(from, compress_raw_fail);
+ to->compress_raw_ok += WT_STAT_READ(from, compress_raw_ok);
+ to->cursor_insert_bulk += WT_STAT_READ(from, cursor_insert_bulk);
+ to->cursor_create += WT_STAT_READ(from, cursor_create);
+ to->cursor_insert_bytes += WT_STAT_READ(from, cursor_insert_bytes);
+ to->cursor_remove_bytes += WT_STAT_READ(from, cursor_remove_bytes);
+ to->cursor_update_bytes += WT_STAT_READ(from, cursor_update_bytes);
+ to->cursor_insert += WT_STAT_READ(from, cursor_insert);
+ to->cursor_next += WT_STAT_READ(from, cursor_next);
+ to->cursor_prev += WT_STAT_READ(from, cursor_prev);
+ to->cursor_remove += WT_STAT_READ(from, cursor_remove);
+ to->cursor_reset += WT_STAT_READ(from, cursor_reset);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
+ to->cursor_search += WT_STAT_READ(from, cursor_search);
+ to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
+ to->cursor_update += WT_STAT_READ(from, cursor_update);
+ to->bloom_false_positive += WT_STAT_READ(from, bloom_false_positive);
+ to->bloom_hit += WT_STAT_READ(from, bloom_hit);
+ to->bloom_miss += WT_STAT_READ(from, bloom_miss);
+ to->bloom_page_evict += WT_STAT_READ(from, bloom_page_evict);
+ to->bloom_page_read += WT_STAT_READ(from, bloom_page_read);
+ to->bloom_count += WT_STAT_READ(from, bloom_count);
+ to->lsm_chunk_count += WT_STAT_READ(from, lsm_chunk_count);
+ if ((v = WT_STAT_READ(from, lsm_generation_max)) >
+ to->lsm_generation_max)
+ to->lsm_generation_max = v;
+ to->lsm_lookup_no_bloom += WT_STAT_READ(from, lsm_lookup_no_bloom);
+ to->lsm_checkpoint_throttle +=
+ WT_STAT_READ(from, lsm_checkpoint_throttle);
+ to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle);
+ to->bloom_size += WT_STAT_READ(from, bloom_size);
+ to->rec_dictionary += WT_STAT_READ(from, rec_dictionary);
+ to->rec_suffix_compression +=
+ WT_STAT_READ(from, rec_suffix_compression);
+ to->rec_multiblock_internal +=
+ WT_STAT_READ(from, rec_multiblock_internal);
+ to->rec_overflow_key_internal +=
+ WT_STAT_READ(from, rec_overflow_key_internal);
+ to->rec_prefix_compression +=
+ WT_STAT_READ(from, rec_prefix_compression);
+ to->rec_multiblock_leaf += WT_STAT_READ(from, rec_multiblock_leaf);
+ to->rec_overflow_key_leaf +=
+ WT_STAT_READ(from, rec_overflow_key_leaf);
+ if ((v = WT_STAT_READ(from, rec_multiblock_max)) >
+ to->rec_multiblock_max)
+ to->rec_multiblock_max = v;
+ to->rec_overflow_value += WT_STAT_READ(from, rec_overflow_value);
+ to->rec_page_match += WT_STAT_READ(from, rec_page_match);
+ to->rec_pages += WT_STAT_READ(from, rec_pages);
+ to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction);
+ to->rec_page_delete += WT_STAT_READ(from, rec_page_delete);
+ to->session_compact += WT_STAT_READ(from, session_compact);
+ to->session_cursor_open += WT_STAT_READ(from, session_cursor_open);
+ to->txn_update_conflict += WT_STAT_READ(from, txn_update_conflict);
+}
+
+static const char * const __stats_connection_desc[] = {
+ "async: number of allocation state races",
+ "async: number of operation slots viewed for allocation",
+ "async: current work queue length",
+ "async: number of flush calls",
+ "async: number of times operation allocation failed",
+ "async: maximum work queue length",
+ "async: number of times worker found no work",
+ "async: total allocations",
+ "async: total compact calls",
+ "async: total insert calls",
+ "async: total remove calls",
+ "async: total search calls",
+ "async: total update calls",
+ "block-manager: mapped bytes read",
+ "block-manager: bytes read",
+ "block-manager: bytes written",
+ "block-manager: mapped blocks read",
+ "block-manager: blocks pre-loaded",
+ "block-manager: blocks read",
+ "block-manager: blocks written",
+ "cache: tracked dirty bytes in the cache",
+ "cache: tracked bytes belonging to internal pages in the cache",
+ "cache: bytes currently in the cache",
+ "cache: tracked bytes belonging to leaf pages in the cache",
+ "cache: maximum bytes configured",
+ "cache: tracked bytes belonging to overflow pages in the cache",
+ "cache: bytes read into cache",
+ "cache: bytes written from cache",
+ "cache: pages evicted by application threads",
+ "cache: checkpoint blocked page eviction",
+ "cache: unmodified pages evicted",
+ "cache: page split during eviction deepened the tree",
+ "cache: modified pages evicted",
+ "cache: pages selected for eviction unable to be evicted",
+ "cache: pages evicted because they exceeded the in-memory maximum",
+ "cache: pages evicted because they had chains of deleted items",
+ "cache: failed eviction of pages that exceeded the in-memory maximum",
+ "cache: hazard pointer blocked page eviction",
+ "cache: internal pages evicted",
+ "cache: maximum page size at eviction",
+ "cache: eviction server candidate queue empty when topping up",
+ "cache: eviction server candidate queue not empty when topping up",
+ "cache: eviction server evicting pages",
+ "cache: eviction server populating queue, but not evicting pages",
+ "cache: eviction server unable to reach eviction goal",
+ "cache: pages split during eviction",
+ "cache: pages walked for eviction",
+ "cache: eviction worker thread evicting pages",
+ "cache: in-memory page splits",
+ "cache: in-memory page passed criteria to be split",
+ "cache: lookaside table insert calls",
+ "cache: lookaside table remove calls",
+ "cache: percentage overhead",
+ "cache: tracked dirty pages in the cache",
+ "cache: pages currently held in the cache",
+ "cache: pages read into cache",
+ "cache: pages read into cache requiring lookaside entries",
+ "cache: pages written from cache",
+ "cache: page written requiring lookaside records",
+ "cache: pages written requiring in-memory restoration",
+ "connection: pthread mutex condition wait calls",
+ "cursor: cursor create calls",
+ "cursor: cursor insert calls",
+ "cursor: cursor next calls",
+ "cursor: cursor prev calls",
+ "cursor: cursor remove calls",
+ "cursor: cursor reset calls",
+ "cursor: cursor restarted searches",
+ "cursor: cursor search calls",
+ "cursor: cursor search near calls",
+ "cursor: cursor update calls",
+ "data-handle: connection data handles currently active",
+ "data-handle: session dhandles swept",
+ "data-handle: session sweep attempts",
+ "data-handle: connection sweep dhandles closed",
+ "data-handle: connection sweep candidate became referenced",
+ "data-handle: connection sweep dhandles removed from hash list",
+ "data-handle: connection sweep time-of-death sets",
+ "data-handle: connection sweeps",
+ "connection: files currently open",
+ "log: total log buffer size",
+ "log: log bytes of payload data",
+ "log: log bytes written",
+ "log: yields waiting for previous log file close",
+ "log: total size of compressed records",
+ "log: total in-memory size of compressed records",
+ "log: log records too small to compress",
+ "log: log records not compressed",
+ "log: log records compressed",
+ "log: maximum log file size",
+ "log: pre-allocated log files prepared",
+ "log: number of pre-allocated log files to create",
+ "log: pre-allocated log files used",
+ "log: log release advances write LSN",
+ "log: records processed by log scan",
+ "log: log scan records requiring two reads",
+ "log: log scan operations",
+ "log: consolidated slot closures",
+ "log: written slots coalesced",
+ "log: logging bytes consolidated",
+ "log: consolidated slot joins",
+ "log: consolidated slot join races",
+ "log: busy returns attempting to switch slots",
+ "log: consolidated slot join transitions",
+ "log: consolidated slot unbuffered writes",
+ "log: log sync operations",
+ "log: log sync_dir operations",
+ "log: log server thread advances write LSN",
+ "log: log write operations",
+ "LSM: sleep for LSM checkpoint throttle",
+ "LSM: sleep for LSM merge throttle",
+ "LSM: rows merged in an LSM tree",
+ "LSM: application work units currently queued",
+ "LSM: merge work units currently queued",
+ "LSM: tree queue hit maximum",
+ "LSM: switch work units currently queued",
+ "LSM: tree maintenance operations scheduled",
+ "LSM: tree maintenance operations discarded",
+ "LSM: tree maintenance operations executed",
+ "connection: memory allocations",
+ "connection: memory frees",
+ "connection: memory re-allocations",
+ "thread-yield: page acquire busy blocked",
+ "thread-yield: page acquire eviction blocked",
+ "thread-yield: page acquire locked blocked",
+ "thread-yield: page acquire read blocked",
+ "thread-yield: page acquire time sleeping (usecs)",
+ "connection: total read I/Os",
+ "reconciliation: page reconciliation calls",
+ "reconciliation: page reconciliation calls for eviction",
+ "reconciliation: split bytes currently awaiting free",
+ "reconciliation: split objects currently awaiting free",
+ "connection: pthread mutex shared lock read-lock calls",
+ "connection: pthread mutex shared lock write-lock calls",
+ "session: open cursor count",
+ "session: open session count",
+ "transaction: transaction begins",
+ "transaction: transaction checkpoints",
+ "transaction: transaction checkpoint generation",
+ "transaction: transaction checkpoint currently running",
+ "transaction: transaction checkpoint max time (msecs)",
+ "transaction: transaction checkpoint min time (msecs)",
+ "transaction: transaction checkpoint most recent time (msecs)",
+ "transaction: transaction checkpoint total time (msecs)",
+ "transaction: transactions committed",
+ "transaction: transaction failures due to cache overflow",
+ "transaction: transaction range of IDs currently pinned by a checkpoint",
+ "transaction: transaction range of IDs currently pinned",
+ "transaction: transactions rolled back",
+ "transaction: transaction sync calls",
+ "connection: total write I/Os",
+};
+
+const char *
+__wt_stat_connection_desc(int slot)
+{
+ return (__stats_connection_desc[slot]);
}
void
-__wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
+__wt_stat_connection_init_single(WT_CONNECTION_STATS *stats)
{
- /* Clear, so can also be called for reinitialization. */
memset(stats, 0, sizeof(*stats));
+}
+
+void
+__wt_stat_connection_init(WT_CONNECTION_IMPL *handle)
+{
+ int i;
- stats->async_cur_queue.desc = "async: current work queue length";
- stats->async_max_queue.desc = "async: maximum work queue length";
- stats->async_alloc_race.desc =
- "async: number of allocation state races";
- stats->async_flush.desc = "async: number of flush calls";
- stats->async_alloc_view.desc =
- "async: number of operation slots viewed for allocation";
- stats->async_full.desc =
- "async: number of times operation allocation failed";
- stats->async_nowork.desc =
- "async: number of times worker found no work";
- stats->async_op_alloc.desc = "async: total allocations";
- stats->async_op_compact.desc = "async: total compact calls";
- stats->async_op_insert.desc = "async: total insert calls";
- stats->async_op_remove.desc = "async: total remove calls";
- stats->async_op_search.desc = "async: total search calls";
- stats->async_op_update.desc = "async: total update calls";
- stats->block_preload.desc = "block-manager: blocks pre-loaded";
- stats->block_read.desc = "block-manager: blocks read";
- stats->block_write.desc = "block-manager: blocks written";
- stats->block_byte_read.desc = "block-manager: bytes read";
- stats->block_byte_write.desc = "block-manager: bytes written";
- stats->block_map_read.desc = "block-manager: mapped blocks read";
- stats->block_byte_map_read.desc = "block-manager: mapped bytes read";
- stats->cache_bytes_inuse.desc = "cache: bytes currently in the cache";
- stats->cache_bytes_read.desc = "cache: bytes read into cache";
- stats->cache_bytes_write.desc = "cache: bytes written from cache";
- stats->cache_eviction_checkpoint.desc =
- "cache: checkpoint blocked page eviction";
- stats->cache_eviction_queue_empty.desc =
- "cache: eviction server candidate queue empty when topping up";
- stats->cache_eviction_queue_not_empty.desc =
- "cache: eviction server candidate queue not empty when topping up";
- stats->cache_eviction_server_evicting.desc =
- "cache: eviction server evicting pages";
- stats->cache_eviction_server_not_evicting.desc =
- "cache: eviction server populating queue, but not evicting pages";
- stats->cache_eviction_slow.desc =
- "cache: eviction server unable to reach eviction goal";
- stats->cache_eviction_worker_evicting.desc =
- "cache: eviction worker thread evicting pages";
- stats->cache_eviction_force_fail.desc =
- "cache: failed eviction of pages that exceeded the in-memory maximum";
- stats->cache_eviction_hazard.desc =
- "cache: hazard pointer blocked page eviction";
- stats->cache_inmem_split.desc = "cache: in-memory page splits";
- stats->cache_eviction_internal.desc = "cache: internal pages evicted";
- stats->cache_bytes_max.desc = "cache: maximum bytes configured";
- stats->cache_eviction_maximum_page_size.desc =
- "cache: maximum page size at eviction";
- stats->cache_eviction_dirty.desc = "cache: modified pages evicted";
- stats->cache_eviction_deepen.desc =
- "cache: page split during eviction deepened the tree";
- stats->cache_pages_inuse.desc =
- "cache: pages currently held in the cache";
- stats->cache_eviction_force.desc =
- "cache: pages evicted because they exceeded the in-memory maximum";
- stats->cache_eviction_force_delete.desc =
- "cache: pages evicted because they had chains of deleted items";
- stats->cache_eviction_app.desc =
- "cache: pages evicted by application threads";
- stats->cache_read.desc = "cache: pages read into cache";
- stats->cache_eviction_fail.desc =
- "cache: pages selected for eviction unable to be evicted";
- stats->cache_eviction_split.desc =
- "cache: pages split during eviction";
- stats->cache_eviction_walk.desc = "cache: pages walked for eviction";
- stats->cache_write.desc = "cache: pages written from cache";
- stats->cache_overhead.desc = "cache: percentage overhead";
- stats->cache_bytes_internal.desc =
- "cache: tracked bytes belonging to internal pages in the cache";
- stats->cache_bytes_leaf.desc =
- "cache: tracked bytes belonging to leaf pages in the cache";
- stats->cache_bytes_overflow.desc =
- "cache: tracked bytes belonging to overflow pages in the cache";
- stats->cache_bytes_dirty.desc =
- "cache: tracked dirty bytes in the cache";
- stats->cache_pages_dirty.desc =
- "cache: tracked dirty pages in the cache";
- stats->cache_eviction_clean.desc = "cache: unmodified pages evicted";
- stats->file_open.desc = "connection: files currently open";
- stats->memory_allocation.desc = "connection: memory allocations";
- stats->memory_free.desc = "connection: memory frees";
- stats->memory_grow.desc = "connection: memory re-allocations";
- stats->cond_wait.desc =
- "connection: pthread mutex condition wait calls";
- stats->rwlock_read.desc =
- "connection: pthread mutex shared lock read-lock calls";
- stats->rwlock_write.desc =
- "connection: pthread mutex shared lock write-lock calls";
- stats->read_io.desc = "connection: total read I/Os";
- stats->write_io.desc = "connection: total write I/Os";
- stats->cursor_create.desc = "cursor: cursor create calls";
- stats->cursor_insert.desc = "cursor: cursor insert calls";
- stats->cursor_next.desc = "cursor: cursor next calls";
- stats->cursor_prev.desc = "cursor: cursor prev calls";
- stats->cursor_remove.desc = "cursor: cursor remove calls";
- stats->cursor_reset.desc = "cursor: cursor reset calls";
- stats->cursor_search.desc = "cursor: cursor search calls";
- stats->cursor_search_near.desc = "cursor: cursor search near calls";
- stats->cursor_update.desc = "cursor: cursor update calls";
- stats->dh_conn_ref.desc =
- "data-handle: connection candidate referenced";
- stats->dh_conn_handles.desc = "data-handle: connection dhandles swept";
- stats->dh_conn_sweeps.desc = "data-handle: connection sweeps";
- stats->dh_conn_tod.desc = "data-handle: connection time-of-death sets";
- stats->dh_session_handles.desc = "data-handle: session dhandles swept";
- stats->dh_session_sweeps.desc = "data-handle: session sweep attempts";
- stats->log_slot_closes.desc = "log: consolidated slot closures";
- stats->log_slot_races.desc = "log: consolidated slot join races";
- stats->log_slot_transitions.desc =
- "log: consolidated slot join transitions";
- stats->log_slot_joins.desc = "log: consolidated slot joins";
- stats->log_slot_toosmall.desc =
- "log: failed to find a slot large enough for record";
- stats->log_bytes_payload.desc = "log: log bytes of payload data";
- stats->log_bytes_written.desc = "log: log bytes written";
- stats->log_compress_writes.desc = "log: log records compressed";
- stats->log_compress_write_fails.desc =
- "log: log records not compressed";
- stats->log_compress_small.desc =
- "log: log records too small to compress";
- stats->log_release_write_lsn.desc =
- "log: log release advances write LSN";
- stats->log_scans.desc = "log: log scan operations";
- stats->log_scan_rereads.desc =
- "log: log scan records requiring two reads";
- stats->log_write_lsn.desc =
- "log: log server thread advances write LSN";
- stats->log_sync.desc = "log: log sync operations";
- stats->log_sync_dir.desc = "log: log sync_dir operations";
- stats->log_writes.desc = "log: log write operations";
- stats->log_slot_consolidated.desc = "log: logging bytes consolidated";
- stats->log_max_filesize.desc = "log: maximum log file size";
- stats->log_prealloc_max.desc =
- "log: number of pre-allocated log files to create";
- stats->log_prealloc_files.desc =
- "log: pre-allocated log files prepared";
- stats->log_prealloc_used.desc = "log: pre-allocated log files used";
- stats->log_slot_toobig.desc = "log: record size exceeded maximum";
- stats->log_scan_records.desc = "log: records processed by log scan";
- stats->log_compress_mem.desc =
- "log: total in-memory size of compressed records";
- stats->log_buffer_size.desc = "log: total log buffer size";
- stats->log_compress_len.desc = "log: total size of compressed records";
- stats->log_slot_coalesced.desc = "log: written slots coalesced";
- stats->log_close_yields.desc =
- "log: yields waiting for previous log file close";
- stats->lsm_work_queue_app.desc =
- "LSM: application work units currently queued";
- stats->lsm_work_queue_manager.desc =
- "LSM: merge work units currently queued";
- stats->lsm_rows_merged.desc = "LSM: rows merged in an LSM tree";
- stats->lsm_checkpoint_throttle.desc =
- "LSM: sleep for LSM checkpoint throttle";
- stats->lsm_merge_throttle.desc = "LSM: sleep for LSM merge throttle";
- stats->lsm_work_queue_switch.desc =
- "LSM: switch work units currently queued";
- stats->lsm_work_units_discarded.desc =
- "LSM: tree maintenance operations discarded";
- stats->lsm_work_units_done.desc =
- "LSM: tree maintenance operations executed";
- stats->lsm_work_units_created.desc =
- "LSM: tree maintenance operations scheduled";
- stats->lsm_work_queue_max.desc = "LSM: tree queue hit maximum";
- stats->rec_pages.desc = "reconciliation: page reconciliation calls";
- stats->rec_pages_eviction.desc =
- "reconciliation: page reconciliation calls for eviction";
- stats->rec_split_stashed_bytes.desc =
- "reconciliation: split bytes currently awaiting free";
- stats->rec_split_stashed_objects.desc =
- "reconciliation: split objects currently awaiting free";
- stats->session_cursor_open.desc = "session: open cursor count";
- stats->session_open.desc = "session: open session count";
- stats->page_busy_blocked.desc =
- "thread-yield: page acquire busy blocked";
- stats->page_forcible_evict_blocked.desc =
- "thread-yield: page acquire eviction blocked";
- stats->page_locked_blocked.desc =
- "thread-yield: page acquire locked blocked";
- stats->page_read_blocked.desc =
- "thread-yield: page acquire read blocked";
- stats->page_sleep.desc =
- "thread-yield: page acquire time sleeping (usecs)";
- stats->txn_begin.desc = "transaction: transaction begins";
- stats->txn_checkpoint_running.desc =
- "transaction: transaction checkpoint currently running";
- stats->txn_checkpoint_generation.desc =
- "transaction: transaction checkpoint generation";
- stats->txn_checkpoint_time_max.desc =
- "transaction: transaction checkpoint max time (msecs)";
- stats->txn_checkpoint_time_min.desc =
- "transaction: transaction checkpoint min time (msecs)";
- stats->txn_checkpoint_time_recent.desc =
- "transaction: transaction checkpoint most recent time (msecs)";
- stats->txn_checkpoint_time_total.desc =
- "transaction: transaction checkpoint total time (msecs)";
- stats->txn_checkpoint.desc = "transaction: transaction checkpoints";
- stats->txn_fail_cache.desc =
- "transaction: transaction failures due to cache overflow";
- stats->txn_pinned_range.desc =
- "transaction: transaction range of IDs currently pinned";
- stats->txn_pinned_checkpoint_range.desc =
- "transaction: transaction range of IDs currently pinned by a checkpoint";
- stats->txn_sync.desc = "transaction: transaction sync calls";
- stats->txn_commit.desc = "transaction: transactions committed";
- stats->txn_rollback.desc = "transaction: transactions rolled back";
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i) {
+ handle->stats[i] = &handle->stat_array[i];
+ __wt_stat_connection_init_single(handle->stats[i]);
+ }
}
void
-__wt_stat_refresh_connection_stats(void *stats_arg)
+__wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
{
- WT_CONNECTION_STATS *stats;
+ stats->async_cur_queue = 0;
+ /* not clearing async_max_queue */
+ stats->async_alloc_race = 0;
+ stats->async_flush = 0;
+ stats->async_alloc_view = 0;
+ stats->async_full = 0;
+ stats->async_nowork = 0;
+ stats->async_op_alloc = 0;
+ stats->async_op_compact = 0;
+ stats->async_op_insert = 0;
+ stats->async_op_remove = 0;
+ stats->async_op_search = 0;
+ stats->async_op_update = 0;
+ stats->block_preload = 0;
+ stats->block_read = 0;
+ stats->block_write = 0;
+ stats->block_byte_read = 0;
+ stats->block_byte_write = 0;
+ stats->block_map_read = 0;
+ stats->block_byte_map_read = 0;
+ /* not clearing cache_bytes_inuse */
+ stats->cache_bytes_read = 0;
+ stats->cache_bytes_write = 0;
+ stats->cache_eviction_checkpoint = 0;
+ stats->cache_eviction_queue_empty = 0;
+ stats->cache_eviction_queue_not_empty = 0;
+ stats->cache_eviction_server_evicting = 0;
+ stats->cache_eviction_server_not_evicting = 0;
+ stats->cache_eviction_slow = 0;
+ stats->cache_eviction_worker_evicting = 0;
+ stats->cache_eviction_force_fail = 0;
+ stats->cache_eviction_hazard = 0;
+ stats->cache_inmem_splittable = 0;
+ stats->cache_inmem_split = 0;
+ stats->cache_eviction_internal = 0;
+ stats->cache_lookaside_insert = 0;
+ stats->cache_lookaside_remove = 0;
+ /* not clearing cache_bytes_max */
+ /* not clearing cache_eviction_maximum_page_size */
+ stats->cache_eviction_dirty = 0;
+ stats->cache_eviction_deepen = 0;
+ stats->cache_write_lookaside = 0;
+ /* not clearing cache_pages_inuse */
+ stats->cache_eviction_force = 0;
+ stats->cache_eviction_force_delete = 0;
+ stats->cache_eviction_app = 0;
+ stats->cache_read = 0;
+ stats->cache_read_lookaside = 0;
+ stats->cache_eviction_fail = 0;
+ stats->cache_eviction_split = 0;
+ stats->cache_eviction_walk = 0;
+ stats->cache_write = 0;
+ stats->cache_write_restore = 0;
+ /* not clearing cache_overhead */
+ /* not clearing cache_bytes_internal */
+ /* not clearing cache_bytes_leaf */
+ /* not clearing cache_bytes_overflow */
+ /* not clearing cache_bytes_dirty */
+ /* not clearing cache_pages_dirty */
+ stats->cache_eviction_clean = 0;
+ /* not clearing file_open */
+ stats->memory_allocation = 0;
+ stats->memory_free = 0;
+ stats->memory_grow = 0;
+ stats->cond_wait = 0;
+ stats->rwlock_read = 0;
+ stats->rwlock_write = 0;
+ stats->read_io = 0;
+ stats->write_io = 0;
+ stats->cursor_create = 0;
+ stats->cursor_insert = 0;
+ stats->cursor_next = 0;
+ stats->cursor_prev = 0;
+ stats->cursor_remove = 0;
+ stats->cursor_reset = 0;
+ stats->cursor_restart = 0;
+ stats->cursor_search = 0;
+ stats->cursor_search_near = 0;
+ stats->cursor_update = 0;
+ /* not clearing dh_conn_handle_count */
+ stats->dh_sweep_ref = 0;
+ stats->dh_sweep_close = 0;
+ stats->dh_sweep_remove = 0;
+ stats->dh_sweep_tod = 0;
+ stats->dh_sweeps = 0;
+ stats->dh_session_handles = 0;
+ stats->dh_session_sweeps = 0;
+ stats->log_slot_switch_busy = 0;
+ stats->log_slot_closes = 0;
+ stats->log_slot_races = 0;
+ stats->log_slot_transitions = 0;
+ stats->log_slot_joins = 0;
+ stats->log_slot_unbuffered = 0;
+ stats->log_bytes_payload = 0;
+ stats->log_bytes_written = 0;
+ stats->log_compress_writes = 0;
+ stats->log_compress_write_fails = 0;
+ stats->log_compress_small = 0;
+ stats->log_release_write_lsn = 0;
+ stats->log_scans = 0;
+ stats->log_scan_rereads = 0;
+ stats->log_write_lsn = 0;
+ stats->log_sync = 0;
+ stats->log_sync_dir = 0;
+ stats->log_writes = 0;
+ stats->log_slot_consolidated = 0;
+ /* not clearing log_max_filesize */
+ /* not clearing log_prealloc_max */
+ stats->log_prealloc_files = 0;
+ stats->log_prealloc_used = 0;
+ stats->log_scan_records = 0;
+ stats->log_compress_mem = 0;
+ /* not clearing log_buffer_size */
+ stats->log_compress_len = 0;
+ stats->log_slot_coalesced = 0;
+ stats->log_close_yields = 0;
+ /* not clearing lsm_work_queue_app */
+ /* not clearing lsm_work_queue_manager */
+ stats->lsm_rows_merged = 0;
+ stats->lsm_checkpoint_throttle = 0;
+ stats->lsm_merge_throttle = 0;
+ /* not clearing lsm_work_queue_switch */
+ stats->lsm_work_units_discarded = 0;
+ stats->lsm_work_units_done = 0;
+ stats->lsm_work_units_created = 0;
+ stats->lsm_work_queue_max = 0;
+ stats->rec_pages = 0;
+ stats->rec_pages_eviction = 0;
+ /* not clearing rec_split_stashed_bytes */
+ /* not clearing rec_split_stashed_objects */
+ /* not clearing session_cursor_open */
+ /* not clearing session_open */
+ stats->page_busy_blocked = 0;
+ stats->page_forcible_evict_blocked = 0;
+ stats->page_locked_blocked = 0;
+ stats->page_read_blocked = 0;
+ stats->page_sleep = 0;
+ stats->txn_begin = 0;
+ /* not clearing txn_checkpoint_running */
+ /* not clearing txn_checkpoint_generation */
+ /* not clearing txn_checkpoint_time_max */
+ /* not clearing txn_checkpoint_time_min */
+ /* not clearing txn_checkpoint_time_recent */
+ /* not clearing txn_checkpoint_time_total */
+ stats->txn_checkpoint = 0;
+ stats->txn_fail_cache = 0;
+ /* not clearing txn_pinned_range */
+ /* not clearing txn_pinned_checkpoint_range */
+ stats->txn_sync = 0;
+ stats->txn_commit = 0;
+ stats->txn_rollback = 0;
+}
+
+void
+__wt_stat_connection_clear_all(WT_CONNECTION_STATS **stats)
+{
+ u_int i;
- stats = (WT_CONNECTION_STATS *)stats_arg;
- stats->async_cur_queue.v = 0;
- stats->async_alloc_race.v = 0;
- stats->async_flush.v = 0;
- stats->async_alloc_view.v = 0;
- stats->async_full.v = 0;
- stats->async_nowork.v = 0;
- stats->async_op_alloc.v = 0;
- stats->async_op_compact.v = 0;
- stats->async_op_insert.v = 0;
- stats->async_op_remove.v = 0;
- stats->async_op_search.v = 0;
- stats->async_op_update.v = 0;
- stats->block_preload.v = 0;
- stats->block_read.v = 0;
- stats->block_write.v = 0;
- stats->block_byte_read.v = 0;
- stats->block_byte_write.v = 0;
- stats->block_map_read.v = 0;
- stats->block_byte_map_read.v = 0;
- stats->cache_bytes_read.v = 0;
- stats->cache_bytes_write.v = 0;
- stats->cache_eviction_checkpoint.v = 0;
- stats->cache_eviction_queue_empty.v = 0;
- stats->cache_eviction_queue_not_empty.v = 0;
- stats->cache_eviction_server_evicting.v = 0;
- stats->cache_eviction_server_not_evicting.v = 0;
- stats->cache_eviction_slow.v = 0;
- stats->cache_eviction_worker_evicting.v = 0;
- stats->cache_eviction_force_fail.v = 0;
- stats->cache_eviction_hazard.v = 0;
- stats->cache_inmem_split.v = 0;
- stats->cache_eviction_internal.v = 0;
- stats->cache_eviction_dirty.v = 0;
- stats->cache_eviction_deepen.v = 0;
- stats->cache_eviction_force.v = 0;
- stats->cache_eviction_force_delete.v = 0;
- stats->cache_eviction_app.v = 0;
- stats->cache_read.v = 0;
- stats->cache_eviction_fail.v = 0;
- stats->cache_eviction_split.v = 0;
- stats->cache_eviction_walk.v = 0;
- stats->cache_write.v = 0;
- stats->cache_eviction_clean.v = 0;
- stats->memory_allocation.v = 0;
- stats->memory_free.v = 0;
- stats->memory_grow.v = 0;
- stats->cond_wait.v = 0;
- stats->rwlock_read.v = 0;
- stats->rwlock_write.v = 0;
- stats->read_io.v = 0;
- stats->write_io.v = 0;
- stats->cursor_create.v = 0;
- stats->cursor_insert.v = 0;
- stats->cursor_next.v = 0;
- stats->cursor_prev.v = 0;
- stats->cursor_remove.v = 0;
- stats->cursor_reset.v = 0;
- stats->cursor_search.v = 0;
- stats->cursor_search_near.v = 0;
- stats->cursor_update.v = 0;
- stats->dh_conn_ref.v = 0;
- stats->dh_conn_handles.v = 0;
- stats->dh_conn_sweeps.v = 0;
- stats->dh_conn_tod.v = 0;
- stats->dh_session_handles.v = 0;
- stats->dh_session_sweeps.v = 0;
- stats->log_slot_closes.v = 0;
- stats->log_slot_races.v = 0;
- stats->log_slot_transitions.v = 0;
- stats->log_slot_joins.v = 0;
- stats->log_slot_toosmall.v = 0;
- stats->log_bytes_payload.v = 0;
- stats->log_bytes_written.v = 0;
- stats->log_compress_writes.v = 0;
- stats->log_compress_write_fails.v = 0;
- stats->log_compress_small.v = 0;
- stats->log_release_write_lsn.v = 0;
- stats->log_scans.v = 0;
- stats->log_scan_rereads.v = 0;
- stats->log_write_lsn.v = 0;
- stats->log_sync.v = 0;
- stats->log_sync_dir.v = 0;
- stats->log_writes.v = 0;
- stats->log_slot_consolidated.v = 0;
- stats->log_prealloc_files.v = 0;
- stats->log_prealloc_used.v = 0;
- stats->log_slot_toobig.v = 0;
- stats->log_scan_records.v = 0;
- stats->log_compress_mem.v = 0;
- stats->log_compress_len.v = 0;
- stats->log_slot_coalesced.v = 0;
- stats->log_close_yields.v = 0;
- stats->lsm_rows_merged.v = 0;
- stats->lsm_checkpoint_throttle.v = 0;
- stats->lsm_merge_throttle.v = 0;
- stats->lsm_work_units_discarded.v = 0;
- stats->lsm_work_units_done.v = 0;
- stats->lsm_work_units_created.v = 0;
- stats->lsm_work_queue_max.v = 0;
- stats->rec_pages.v = 0;
- stats->rec_pages_eviction.v = 0;
- stats->page_busy_blocked.v = 0;
- stats->page_forcible_evict_blocked.v = 0;
- stats->page_locked_blocked.v = 0;
- stats->page_read_blocked.v = 0;
- stats->page_sleep.v = 0;
- stats->txn_begin.v = 0;
- stats->txn_checkpoint.v = 0;
- stats->txn_fail_cache.v = 0;
- stats->txn_sync.v = 0;
- stats->txn_commit.v = 0;
- stats->txn_rollback.v = 0;
+ for (i = 0; i < WT_COUNTER_SLOTS; ++i)
+ __wt_stat_connection_clear_single(stats[i]);
+}
+
+void
+__wt_stat_connection_aggregate(
+ WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *to)
+{
+ to->async_cur_queue += WT_STAT_READ(from, async_cur_queue);
+ to->async_max_queue += WT_STAT_READ(from, async_max_queue);
+ to->async_alloc_race += WT_STAT_READ(from, async_alloc_race);
+ to->async_flush += WT_STAT_READ(from, async_flush);
+ to->async_alloc_view += WT_STAT_READ(from, async_alloc_view);
+ to->async_full += WT_STAT_READ(from, async_full);
+ to->async_nowork += WT_STAT_READ(from, async_nowork);
+ to->async_op_alloc += WT_STAT_READ(from, async_op_alloc);
+ to->async_op_compact += WT_STAT_READ(from, async_op_compact);
+ to->async_op_insert += WT_STAT_READ(from, async_op_insert);
+ to->async_op_remove += WT_STAT_READ(from, async_op_remove);
+ to->async_op_search += WT_STAT_READ(from, async_op_search);
+ to->async_op_update += WT_STAT_READ(from, async_op_update);
+ to->block_preload += WT_STAT_READ(from, block_preload);
+ to->block_read += WT_STAT_READ(from, block_read);
+ to->block_write += WT_STAT_READ(from, block_write);
+ to->block_byte_read += WT_STAT_READ(from, block_byte_read);
+ to->block_byte_write += WT_STAT_READ(from, block_byte_write);
+ to->block_map_read += WT_STAT_READ(from, block_map_read);
+ to->block_byte_map_read += WT_STAT_READ(from, block_byte_map_read);
+ to->cache_bytes_inuse += WT_STAT_READ(from, cache_bytes_inuse);
+ to->cache_bytes_read += WT_STAT_READ(from, cache_bytes_read);
+ to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
+ to->cache_eviction_checkpoint +=
+ WT_STAT_READ(from, cache_eviction_checkpoint);
+ to->cache_eviction_queue_empty +=
+ WT_STAT_READ(from, cache_eviction_queue_empty);
+ to->cache_eviction_queue_not_empty +=
+ WT_STAT_READ(from, cache_eviction_queue_not_empty);
+ to->cache_eviction_server_evicting +=
+ WT_STAT_READ(from, cache_eviction_server_evicting);
+ to->cache_eviction_server_not_evicting +=
+ WT_STAT_READ(from, cache_eviction_server_not_evicting);
+ to->cache_eviction_slow += WT_STAT_READ(from, cache_eviction_slow);
+ to->cache_eviction_worker_evicting +=
+ WT_STAT_READ(from, cache_eviction_worker_evicting);
+ to->cache_eviction_force_fail +=
+ WT_STAT_READ(from, cache_eviction_force_fail);
+ to->cache_eviction_hazard +=
+ WT_STAT_READ(from, cache_eviction_hazard);
+ to->cache_inmem_splittable +=
+ WT_STAT_READ(from, cache_inmem_splittable);
+ to->cache_inmem_split += WT_STAT_READ(from, cache_inmem_split);
+ to->cache_eviction_internal +=
+ WT_STAT_READ(from, cache_eviction_internal);
+ to->cache_lookaside_insert +=
+ WT_STAT_READ(from, cache_lookaside_insert);
+ to->cache_lookaside_remove +=
+ WT_STAT_READ(from, cache_lookaside_remove);
+ to->cache_bytes_max += WT_STAT_READ(from, cache_bytes_max);
+ to->cache_eviction_maximum_page_size +=
+ WT_STAT_READ(from, cache_eviction_maximum_page_size);
+ to->cache_eviction_dirty += WT_STAT_READ(from, cache_eviction_dirty);
+ to->cache_eviction_deepen +=
+ WT_STAT_READ(from, cache_eviction_deepen);
+ to->cache_write_lookaside +=
+ WT_STAT_READ(from, cache_write_lookaside);
+ to->cache_pages_inuse += WT_STAT_READ(from, cache_pages_inuse);
+ to->cache_eviction_force += WT_STAT_READ(from, cache_eviction_force);
+ to->cache_eviction_force_delete +=
+ WT_STAT_READ(from, cache_eviction_force_delete);
+ to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app);
+ to->cache_read += WT_STAT_READ(from, cache_read);
+ to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
+ to->cache_eviction_split += WT_STAT_READ(from, cache_eviction_split);
+ to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
+ to->cache_write += WT_STAT_READ(from, cache_write);
+ to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
+ to->cache_overhead += WT_STAT_READ(from, cache_overhead);
+ to->cache_bytes_internal += WT_STAT_READ(from, cache_bytes_internal);
+ to->cache_bytes_leaf += WT_STAT_READ(from, cache_bytes_leaf);
+ to->cache_bytes_overflow += WT_STAT_READ(from, cache_bytes_overflow);
+ to->cache_bytes_dirty += WT_STAT_READ(from, cache_bytes_dirty);
+ to->cache_pages_dirty += WT_STAT_READ(from, cache_pages_dirty);
+ to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
+ to->file_open += WT_STAT_READ(from, file_open);
+ to->memory_allocation += WT_STAT_READ(from, memory_allocation);
+ to->memory_free += WT_STAT_READ(from, memory_free);
+ to->memory_grow += WT_STAT_READ(from, memory_grow);
+ to->cond_wait += WT_STAT_READ(from, cond_wait);
+ to->rwlock_read += WT_STAT_READ(from, rwlock_read);
+ to->rwlock_write += WT_STAT_READ(from, rwlock_write);
+ to->read_io += WT_STAT_READ(from, read_io);
+ to->write_io += WT_STAT_READ(from, write_io);
+ to->cursor_create += WT_STAT_READ(from, cursor_create);
+ to->cursor_insert += WT_STAT_READ(from, cursor_insert);
+ to->cursor_next += WT_STAT_READ(from, cursor_next);
+ to->cursor_prev += WT_STAT_READ(from, cursor_prev);
+ to->cursor_remove += WT_STAT_READ(from, cursor_remove);
+ to->cursor_reset += WT_STAT_READ(from, cursor_reset);
+ to->cursor_restart += WT_STAT_READ(from, cursor_restart);
+ to->cursor_search += WT_STAT_READ(from, cursor_search);
+ to->cursor_search_near += WT_STAT_READ(from, cursor_search_near);
+ to->cursor_update += WT_STAT_READ(from, cursor_update);
+ to->dh_conn_handle_count += WT_STAT_READ(from, dh_conn_handle_count);
+ to->dh_sweep_ref += WT_STAT_READ(from, dh_sweep_ref);
+ to->dh_sweep_close += WT_STAT_READ(from, dh_sweep_close);
+ to->dh_sweep_remove += WT_STAT_READ(from, dh_sweep_remove);
+ to->dh_sweep_tod += WT_STAT_READ(from, dh_sweep_tod);
+ to->dh_sweeps += WT_STAT_READ(from, dh_sweeps);
+ to->dh_session_handles += WT_STAT_READ(from, dh_session_handles);
+ to->dh_session_sweeps += WT_STAT_READ(from, dh_session_sweeps);
+ to->log_slot_switch_busy += WT_STAT_READ(from, log_slot_switch_busy);
+ to->log_slot_closes += WT_STAT_READ(from, log_slot_closes);
+ to->log_slot_races += WT_STAT_READ(from, log_slot_races);
+ to->log_slot_transitions += WT_STAT_READ(from, log_slot_transitions);
+ to->log_slot_joins += WT_STAT_READ(from, log_slot_joins);
+ to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered);
+ to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload);
+ to->log_bytes_written += WT_STAT_READ(from, log_bytes_written);
+ to->log_compress_writes += WT_STAT_READ(from, log_compress_writes);
+ to->log_compress_write_fails +=
+ WT_STAT_READ(from, log_compress_write_fails);
+ to->log_compress_small += WT_STAT_READ(from, log_compress_small);
+ to->log_release_write_lsn +=
+ WT_STAT_READ(from, log_release_write_lsn);
+ to->log_scans += WT_STAT_READ(from, log_scans);
+ to->log_scan_rereads += WT_STAT_READ(from, log_scan_rereads);
+ to->log_write_lsn += WT_STAT_READ(from, log_write_lsn);
+ to->log_sync += WT_STAT_READ(from, log_sync);
+ to->log_sync_dir += WT_STAT_READ(from, log_sync_dir);
+ to->log_writes += WT_STAT_READ(from, log_writes);
+ to->log_slot_consolidated +=
+ WT_STAT_READ(from, log_slot_consolidated);
+ to->log_max_filesize += WT_STAT_READ(from, log_max_filesize);
+ to->log_prealloc_max += WT_STAT_READ(from, log_prealloc_max);
+ to->log_prealloc_files += WT_STAT_READ(from, log_prealloc_files);
+ to->log_prealloc_used += WT_STAT_READ(from, log_prealloc_used);
+ to->log_scan_records += WT_STAT_READ(from, log_scan_records);
+ to->log_compress_mem += WT_STAT_READ(from, log_compress_mem);
+ to->log_buffer_size += WT_STAT_READ(from, log_buffer_size);
+ to->log_compress_len += WT_STAT_READ(from, log_compress_len);
+ to->log_slot_coalesced += WT_STAT_READ(from, log_slot_coalesced);
+ to->log_close_yields += WT_STAT_READ(from, log_close_yields);
+ to->lsm_work_queue_app += WT_STAT_READ(from, lsm_work_queue_app);
+ to->lsm_work_queue_manager +=
+ WT_STAT_READ(from, lsm_work_queue_manager);
+ to->lsm_rows_merged += WT_STAT_READ(from, lsm_rows_merged);
+ to->lsm_checkpoint_throttle +=
+ WT_STAT_READ(from, lsm_checkpoint_throttle);
+ to->lsm_merge_throttle += WT_STAT_READ(from, lsm_merge_throttle);
+ to->lsm_work_queue_switch +=
+ WT_STAT_READ(from, lsm_work_queue_switch);
+ to->lsm_work_units_discarded +=
+ WT_STAT_READ(from, lsm_work_units_discarded);
+ to->lsm_work_units_done += WT_STAT_READ(from, lsm_work_units_done);
+ to->lsm_work_units_created +=
+ WT_STAT_READ(from, lsm_work_units_created);
+ to->lsm_work_queue_max += WT_STAT_READ(from, lsm_work_queue_max);
+ to->rec_pages += WT_STAT_READ(from, rec_pages);
+ to->rec_pages_eviction += WT_STAT_READ(from, rec_pages_eviction);
+ to->rec_split_stashed_bytes +=
+ WT_STAT_READ(from, rec_split_stashed_bytes);
+ to->rec_split_stashed_objects +=
+ WT_STAT_READ(from, rec_split_stashed_objects);
+ to->session_cursor_open += WT_STAT_READ(from, session_cursor_open);
+ to->session_open += WT_STAT_READ(from, session_open);
+ to->page_busy_blocked += WT_STAT_READ(from, page_busy_blocked);
+ to->page_forcible_evict_blocked +=
+ WT_STAT_READ(from, page_forcible_evict_blocked);
+ to->page_locked_blocked += WT_STAT_READ(from, page_locked_blocked);
+ to->page_read_blocked += WT_STAT_READ(from, page_read_blocked);
+ to->page_sleep += WT_STAT_READ(from, page_sleep);
+ to->txn_begin += WT_STAT_READ(from, txn_begin);
+ to->txn_checkpoint_running +=
+ WT_STAT_READ(from, txn_checkpoint_running);
+ to->txn_checkpoint_generation +=
+ WT_STAT_READ(from, txn_checkpoint_generation);
+ to->txn_checkpoint_time_max +=
+ WT_STAT_READ(from, txn_checkpoint_time_max);
+ to->txn_checkpoint_time_min +=
+ WT_STAT_READ(from, txn_checkpoint_time_min);
+ to->txn_checkpoint_time_recent +=
+ WT_STAT_READ(from, txn_checkpoint_time_recent);
+ to->txn_checkpoint_time_total +=
+ WT_STAT_READ(from, txn_checkpoint_time_total);
+ to->txn_checkpoint += WT_STAT_READ(from, txn_checkpoint);
+ to->txn_fail_cache += WT_STAT_READ(from, txn_fail_cache);
+ to->txn_pinned_range += WT_STAT_READ(from, txn_pinned_range);
+ to->txn_pinned_checkpoint_range +=
+ WT_STAT_READ(from, txn_pinned_checkpoint_range);
+ to->txn_sync += WT_STAT_READ(from, txn_sync);
+ to->txn_commit += WT_STAT_READ(from, txn_commit);
+ to->txn_rollback += WT_STAT_READ(from, txn_rollback);
}
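
The rewritten aggregation above reflects the new split-statistics layout:
each handle keeps WT_COUNTER_SLOTS copies of a structure of plain int64_t
counters (note __wt_stat_connection_init pointing handle->stats[i] at
handle->stat_array[i]), presumably so that concurrent updaters touch
different cache lines, and readers fold the copies back together. A minimal
self-contained sketch of that read path, assuming WT_STAT_READ simply sums
one field across every slot; SLOTS, toy_stats and stat_read are
illustrative names, not WiredTiger identifiers:

	#include <stddef.h>
	#include <stdint.h>

	#define SLOTS 23		/* stand-in for WT_COUNTER_SLOTS */

	struct toy_stats {		/* one copy exists per slot */
		int64_t cursor_insert;
		int64_t cursor_remove;
	};

	/* Sum one counter across every slot, as WT_STAT_READ is used above. */
	static int64_t
	stat_read(struct toy_stats **stats, size_t offset)
	{
		int64_t sum = 0;

		for (int i = 0; i < SLOTS; ++i)
			sum += *(int64_t *)((char *)stats[i] + offset);
		return (sum);
	}

	/* Usage: stat_read(stats, offsetof(struct toy_stats, cursor_insert)); */
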
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 9e8def39fb0..e81f8a68251 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -134,7 +134,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
if ((count = txn_global->scan_count) < 0)
WT_PAUSE();
} while (count < 0 ||
- !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+ !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -147,7 +147,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
/* Check that the oldest ID has not moved in the meantime. */
if (prev_oldest_id == txn_global->oldest_id) {
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
return;
}
}
@@ -182,12 +182,8 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
txn_state->snap_min = snap_min;
- /* Update the last running ID if we have a much newer value. */
- if (snap_min > txn_global->last_running + 100)
- txn_global->last_running = snap_min;
-
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
__txn_sort_snapshot(session, n, current_id);
}
@@ -212,7 +208,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s;
- uint64_t current_id, id, oldest_id, prev_oldest_id, snap_min;
+ uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
uint32_t i, session_cnt;
int32_t count;
int last_running_moved;
@@ -220,7 +216,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
conn = S2C(session);
txn_global = &conn->txn_global;
- current_id = snap_min = txn_global->current;
+ current_id = last_running = txn_global->current;
oldest_session = NULL;
prev_oldest_id = txn_global->oldest_id;
@@ -241,11 +237,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
if ((count = txn_global->scan_count) < 0)
WT_PAUSE();
} while (count < 0 ||
- !WT_ATOMIC_CAS4(txn_global->scan_count, count, count + 1));
+ !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
/* The oldest ID cannot change until the scan count goes to zero. */
prev_oldest_id = txn_global->oldest_id;
- current_id = oldest_id = snap_min = txn_global->current;
+ current_id = oldest_id = last_running = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -260,8 +256,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
*/
if ((id = s->id) != WT_TXN_NONE &&
WT_TXNID_LE(prev_oldest_id, id) &&
- WT_TXNID_LT(id, snap_min))
- snap_min = id;
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
/*
* !!!
@@ -278,8 +274,8 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
}
}
- if (WT_TXNID_LT(snap_min, oldest_id))
- oldest_id = snap_min;
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
/* The oldest ID can't move past any named snapshots. */
if ((id = txn_global->nsnap_oldest_id) != WT_TXN_NONE &&
@@ -287,25 +283,25 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
oldest_id = id;
/* Update the last running ID. */
- if (WT_TXNID_LT(txn_global->last_running, snap_min)) {
- txn_global->last_running = snap_min;
- last_running_moved = 1;
- } else
- last_running_moved = 0;
+ last_running_moved =
+ WT_TXNID_LT(txn_global->last_running, last_running);
/* Update the oldest ID. */
- if (WT_TXNID_LT(prev_oldest_id, oldest_id) &&
- WT_ATOMIC_CAS4(txn_global->scan_count, 1, -1)) {
+ if ((WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) &&
+ __wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
WT_ORDERED_READ(session_cnt, conn->session_cnt);
for (i = 0, s = txn_global->states; i < session_cnt; i++, s++) {
if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
+ WT_TXNID_LT(id, last_running))
+ last_running = id;
if ((id = s->snap_min) != WT_TXN_NONE &&
WT_TXNID_LT(id, oldest_id))
oldest_id = id;
}
+ if (WT_TXNID_LT(last_running, oldest_id))
+ oldest_id = last_running;
+
#ifdef HAVE_DIAGNOSTIC
/*
* Make sure the ID doesn't move past any named snapshots.
@@ -318,8 +314,11 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
WT_ASSERT(session,
id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
#endif
+ if (WT_TXNID_LT(txn_global->last_running, last_running))
+ txn_global->last_running = last_running;
if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
txn_global->oldest_id = oldest_id;
+ WT_ASSERT(session, txn_global->scan_count == -1);
txn_global->scan_count = 0;
} else {
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
@@ -334,7 +333,7 @@ __wt_txn_update_oldest(WT_SESSION_IMPL *session, int force)
oldest_session->txn.snap_min);
}
WT_ASSERT(session, txn_global->scan_count > 0);
- (void)WT_ATOMIC_SUB4(txn_global->scan_count, 1);
+ (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
}
}
@@ -400,7 +399,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *txn_state;
- int was_oldest;
txn = &session->txn;
WT_ASSERT(session, txn->mod_count == 0);
@@ -408,7 +406,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn_global = &S2C(session)->txn_global;
txn_state = WT_SESSION_TXN_STATE(session);
- was_oldest = 0;
/* Clear the transaction's ID from the global table. */
if (WT_SESSION_IS_CHECKPOINT(session)) {
@@ -419,12 +416,12 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn_global->checkpoint_id = 0;
txn_global->checkpoint_pinned = WT_TXN_NONE;
} else if (F_ISSET(txn, WT_TXN_HAS_ID)) {
+ WT_ASSERT(session,
+ !WT_TXNID_LT(txn->id, txn_global->last_running));
+
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
txn->id != WT_TXN_NONE);
WT_PUBLISH(txn_state->id, WT_TXN_NONE);
-
- /* Quick check for the oldest transaction. */
- was_oldest = (txn->id == txn_global->last_running);
txn->id = WT_TXN_NONE;
}
@@ -443,14 +440,6 @@ __wt_txn_release(WT_SESSION_IMPL *session)
txn->isolation = session->isolation;
/* Ensure the transaction flags are cleared on exit */
txn->flags = 0;
-
- /*
- * When the oldest transaction in the system completes, bump the oldest
- * ID. This is racy and so not guaranteed, but in practice it keeps
- * the oldest ID from falling too far behind.
- */
- if (was_oldest)
- __wt_txn_update_oldest(session, 1);
}
/*
@@ -469,7 +458,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
txn = &session->txn;
conn = S2C(session);
- WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR));
+ WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0);
if (!F_ISSET(txn, WT_TXN_RUNNING))
WT_RET_MSG(session, EINVAL, "No transaction is active");
@@ -593,6 +582,7 @@ __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[])
switch (op->type) {
case WT_TXN_OP_BASIC:
case WT_TXN_OP_INMEM:
+ WT_ASSERT(session, op->u.upd->txnid == txn->id);
op->u.upd->txnid = WT_TXN_ABORTED;
break;
case WT_TXN_OP_REF:
@@ -660,20 +650,29 @@ __wt_txn_stats_update(WT_SESSION_IMPL *session)
{
WT_TXN_GLOBAL *txn_global;
WT_CONNECTION_IMPL *conn;
- WT_CONNECTION_STATS *stats;
+ WT_CONNECTION_STATS **stats;
uint64_t checkpoint_pinned;
conn = S2C(session);
txn_global = &conn->txn_global;
- stats = &conn->stats;
+ stats = conn->stats;
checkpoint_pinned = txn_global->checkpoint_pinned;
- WT_STAT_SET(stats, txn_pinned_range,
- txn_global->current - txn_global->oldest_id);
+ WT_STAT_SET(session, stats, txn_pinned_range,
+ txn_global->current - txn_global->oldest_id);
- WT_STAT_SET(stats, txn_pinned_checkpoint_range,
+ WT_STAT_SET(session, stats, txn_pinned_checkpoint_range,
checkpoint_pinned == WT_TXN_NONE ?
0 : txn_global->current - checkpoint_pinned);
+
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_max, conn->ckpt_time_max);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_min, conn->ckpt_time_min);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_recent, conn->ckpt_time_recent);
+ WT_STAT_SET(
+ session, stats, txn_checkpoint_time_total, conn->ckpt_time_total);
}
/*
@@ -712,10 +711,11 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_rwlock_alloc(session,
&txn_global->nsnap_rwlock, "named snapshot lock"));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
- STAILQ_INIT(&txn_global->nsnaph);
+ TAILQ_INIT(&txn_global->nsnaph);
WT_RET(__wt_calloc_def(
session, conn->session_size, &txn_global->states));
+ WT_CACHE_LINE_ALIGNMENT_VERIFY(session, txn_global->states);
for (i = 0, s = txn_global->states; i < conn->session_size; i++, s++)
s->id = s->snap_min = WT_TXN_NONE;
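
The scan_count handshake in the hunks above is a small reader/writer gate:
snapshot scans spin-increment the count while it is non-negative, and
__wt_txn_update_oldest takes it exclusively by CASing 1 to -1 before
publishing new oldest/last-running IDs. A self-contained sketch of the same
pattern in C11 atomics, on the assumption that __wt_atomic_casiv32 and
__wt_atomic_subiv32 wrap equivalent compare-and-swap and subtract
primitives:

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int scan_count;	/* >= 0: readers; -1: exclusive */

	static void
	scan_enter(void)		/* shared, as in __wt_txn_get_snapshot */
	{
		int count;

		do			/* the real code WT_PAUSEs here */
			count = atomic_load(&scan_count);
		while (count < 0 ||
		    !atomic_compare_exchange_weak(&scan_count, &count, count + 1));
	}

	static void
	scan_leave(void)
	{
		(void)atomic_fetch_sub(&scan_count, 1);
	}

	static bool
	scan_try_exclusive(void)	/* only succeeds for the last reader */
	{
		int expected = 1;

		return (atomic_compare_exchange_strong(&scan_count, &expected, -1));
	}
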
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 49fcd69ffed..9f59c53314e 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -246,6 +246,10 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
WT_ASSERT(session, session->dhandle->checkpoint == NULL);
WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:"));
+ /* Skip files that are never involved in a checkpoint. */
+ if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT))
+ return (0);
+
/* Make sure there is space for the next entry. */
WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
session->ckpt_handle_next + 1, &session->ckpt_handle));
@@ -285,19 +289,22 @@ static void
__checkpoint_stats(
WT_SESSION_IMPL *session, struct timespec *start, struct timespec *stop)
{
+ WT_CONNECTION_IMPL *conn;
uint64_t msec;
+ conn = S2C(session);
+
/*
	 * Get the time difference in milliseconds.
*/
msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION;
- if (msec > WT_CONN_STAT(session, txn_checkpoint_time_max))
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_max, msec);
- if (WT_CONN_STAT(session, txn_checkpoint_time_min) == 0 ||
- msec < WT_CONN_STAT(session, txn_checkpoint_time_min))
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_min, msec);
- WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_recent, msec);
- WT_STAT_FAST_CONN_INCRV(session, txn_checkpoint_time_total, msec);
+
+ if (msec > conn->ckpt_time_max)
+ conn->ckpt_time_max = msec;
+ if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min)
+ conn->ckpt_time_min = msec;
+ conn->ckpt_time_recent = msec;
+ conn->ckpt_time_total += msec;
}
/*
@@ -1161,9 +1168,17 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, int final)
btree = S2BT(session);
bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0;
- /* If the handle is already dead, force the discard. */
+ /*
+ * If the handle is already dead or the file isn't durable, force the
+ * discard.
+ *
+	 * If the file isn't durable, mark the handle dead; there are asserts
+	 * later on checking that only dead handles can have modified pages.
+ */
+ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT))
+ F_SET(session->dhandle, WT_DHANDLE_DEAD);
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
- return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE));
+ return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
/*
* If closing an unmodified file, check that no update is required
diff --git a/src/txn/txn_log.c b/src/txn/txn_log.c
index 0d66eccd7dc..a63720d736f 100644
--- a/src/txn/txn_log.c
+++ b/src/txn/txn_log.c
@@ -33,18 +33,7 @@ __txn_op_log(WT_SESSION_IMPL *session,
* 3) row store remove; or
* 4) row store insert/update.
*/
- if (cbt->btree->type != BTREE_ROW) {
- WT_ASSERT(session, cbt->ins != NULL);
- recno = WT_INSERT_RECNO(cbt->ins);
- WT_ASSERT(session, recno != 0);
-
- if (WT_UPDATE_DELETED_ISSET(upd))
- WT_ERR(__wt_logop_col_remove_pack(session, logrec,
- op->fileid, recno));
- else
- WT_ERR(__wt_logop_col_put_pack(session, logrec,
- op->fileid, recno, &value));
- } else {
+ if (cbt->btree->type == BTREE_ROW) {
WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));
if (WT_UPDATE_DELETED_ISSET(upd))
@@ -53,6 +42,16 @@ __txn_op_log(WT_SESSION_IMPL *session,
else
WT_ERR(__wt_logop_row_put_pack(session, logrec,
op->fileid, &key, &value));
+ } else {
+ recno = WT_INSERT_RECNO(cbt->ins);
+ WT_ASSERT(session, recno != WT_RECNO_OOB);
+
+ if (WT_UPDATE_DELETED_ISSET(upd))
+ WT_ERR(__wt_logop_col_remove_pack(session, logrec,
+ op->fileid, recno));
+ else
+ WT_ERR(__wt_logop_col_put_pack(session, logrec,
+ op->fileid, recno, &value));
}
err: __wt_buf_free(session, &key);
@@ -308,7 +307,7 @@ __wt_txn_checkpoint_log(
switch (flags) {
case WT_TXN_LOG_CKPT_PREPARE:
txn->full_ckpt = 1;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
/*
* We need to make sure that the log records in the checkpoint
* LSN are on disk. In particular to make sure that the
@@ -337,7 +336,7 @@ __wt_txn_checkpoint_log(
txn->ckpt_nsnapshot = 0;
WT_CLEAR(empty);
ckpt_snapshot = &empty;
- *ckpt_lsn = S2C(session)->log->write_start_lsn;
+ WT_ERR(__wt_log_ckpt_lsn(session, ckpt_lsn));
} else
ckpt_snapshot = txn->ckpt_snapshot;
@@ -419,9 +418,9 @@ __wt_txn_truncate_log(
} else {
op->type = WT_TXN_OP_TRUNCATE_COL;
op->u.truncate_col.start =
- (start == NULL) ? 0 : start->recno;
+ (start == NULL) ? WT_RECNO_OOB : start->recno;
op->u.truncate_col.stop =
- (stop == NULL) ? 0 : stop->recno;
+ (stop == NULL) ? WT_RECNO_OOB : stop->recno;
}
/* Write that operation into the in-memory log. */
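
Replacing the bare zeroes with WT_RECNO_OOB documents that the value is out-of-band: no valid column-store record carries it, so it can mean "unbounded" in a truncate range, and the recovery hunk further down decodes it the same way. A small sketch of the sentinel pattern with hypothetical names; the value 0 is an assumption based on how the recovery code tests it:

#include <stddef.h>
#include <stdint.h>

#define MY_RECNO_OOB	0	/* Out-of-band: no valid record number */

struct truncate_range {
	uint64_t start;		/* MY_RECNO_OOB: from the first record */
	uint64_t stop;		/* MY_RECNO_OOB: through the last record */
};

static void
set_range(struct truncate_range *r,
    const uint64_t *startp, const uint64_t *stopp)
{
	r->start = (startp == NULL) ? MY_RECNO_OOB : *startp;
	r->stop = (stopp == NULL) ? MY_RECNO_OOB : *stopp;
}
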
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index bd352c2237e..be736cc1c98 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -34,7 +34,7 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
txn_global = &S2C(session)->txn_global;
- STAILQ_FOREACH(found, &txn_global->nsnaph, q)
+ TAILQ_FOREACH(found, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(found->name, name->str, name->len))
break;
@@ -42,10 +42,10 @@ __nsnap_drop_one(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name)
return (WT_NOTFOUND);
/* Bump the global ID if we are removing the first entry */
- if (found == STAILQ_FIRST(&txn_global->nsnaph))
- txn_global->nsnap_oldest_id = (STAILQ_NEXT(found, q) != NULL) ?
- STAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
- STAILQ_REMOVE(&txn_global->nsnaph, found, __wt_named_snapshot, q);
+ if (found == TAILQ_FIRST(&txn_global->nsnaph))
+ txn_global->nsnap_oldest_id = (TAILQ_NEXT(found, q) != NULL) ?
+ TAILQ_NEXT(found, q)->snap_min : WT_TXN_NONE;
+ TAILQ_REMOVE(&txn_global->nsnaph, found, q);
__nsnap_destroy(session, found);
return (ret);
@@ -67,7 +67,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
last = nsnap = prev = NULL;
txn_global = &S2C(session)->txn_global;
- if (STAILQ_EMPTY(&txn_global->nsnaph)) {
+ if (TAILQ_EMPTY(&txn_global->nsnaph)) {
if (name == NULL)
return (0);
/*
@@ -85,7 +85,7 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
*/
new_nsnap_oldest = WT_TXN_NONE;
if (name != NULL) {
- STAILQ_FOREACH(last, &txn_global->nsnaph, q) {
+ TAILQ_FOREACH(last, &txn_global->nsnaph, q) {
if (WT_STRING_MATCH(last->name, name->str, name->len))
break;
prev = last;
@@ -102,17 +102,17 @@ __nsnap_drop_to(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *name, int inclusive)
last = prev;
}
- if (STAILQ_NEXT(last, q) != NULL)
- new_nsnap_oldest = STAILQ_NEXT(last, q)->snap_min;
+ if (TAILQ_NEXT(last, q) != NULL)
+ new_nsnap_oldest = TAILQ_NEXT(last, q)->snap_min;
}
do {
- nsnap = STAILQ_FIRST(&txn_global->nsnaph);
+ nsnap = TAILQ_FIRST(&txn_global->nsnaph);
WT_ASSERT(session, nsnap != NULL);
- STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q);
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
__nsnap_destroy(session, nsnap);
/* Last will be NULL in the drop-all case, so it will never match */
- } while (nsnap != last && !STAILQ_EMPTY(&txn_global->nsnaph));
+ } while (nsnap != last && !TAILQ_EMPTY(&txn_global->nsnaph));
/* Now that the queue of named snapshots is updated, update the ID */
txn_global->nsnap_oldest_id = new_nsnap_oldest;
@@ -173,9 +173,9 @@ __wt_txn_named_snapshot_begin(WT_SESSION_IMPL *session, const char *cfg[])
*/
WT_ERR_NOTFOUND_OK(__nsnap_drop_one(session, &cval));
- if (STAILQ_EMPTY(&txn_global->nsnaph))
+ if (TAILQ_EMPTY(&txn_global->nsnaph))
txn_global->nsnap_oldest_id = nsnap_new->snap_min;
- STAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
+ TAILQ_INSERT_TAIL(&txn_global->nsnaph, nsnap_new, q);
nsnap_new = NULL;
err: if (started_txn)
@@ -254,7 +254,7 @@ __wt_txn_named_snapshot_get(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *nameval)
WT_RET(__wt_session_copy_values(session));
WT_RET(__wt_readlock(session, txn_global->nsnap_rwlock));
- STAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
+ TAILQ_FOREACH(nsnap, &txn_global->nsnaph, q)
if (WT_STRING_MATCH(nsnap->name, nameval->str, nameval->len)) {
txn->snap_min = txn_state->snap_min = nsnap->snap_min;
txn->snap_max = nsnap->snap_max;
@@ -358,10 +358,8 @@ __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session)
txn_global = &S2C(session)->txn_global;
txn_global->nsnap_oldest_id = WT_TXN_NONE;
- while (!STAILQ_EMPTY(&txn_global->nsnaph)) {
- nsnap = STAILQ_FIRST(&txn_global->nsnaph);
- WT_ASSERT(session, nsnap != NULL);
- STAILQ_REMOVE_HEAD(&txn_global->nsnaph, q);
+ while ((nsnap = TAILQ_FIRST(&txn_global->nsnaph)) != NULL) {
+ TAILQ_REMOVE(&txn_global->nsnaph, nsnap, q);
__nsnap_destroy(session, nsnap);
}
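
The STAILQ-to-TAILQ conversion throughout this file trades a singly-linked tail queue for a doubly-linked one, so any element can be removed with TAILQ_REMOVE instead of only the head, which is what the rewritten destroy loop above relies on. A minimal, self-contained example of the <sys/queue.h> macros used here (the element type is hypothetical):

#include <stdlib.h>
#include <sys/queue.h>

struct snap {
	int id;
	TAILQ_ENTRY(snap) q;		/* Linkage within the queue */
};
TAILQ_HEAD(snap_list, snap);

int
main(void)
{
	struct snap_list head;
	struct snap *s;

	TAILQ_INIT(&head);
	if ((s = calloc(1, sizeof(*s))) == NULL)
		return (1);
	TAILQ_INSERT_TAIL(&head, s, q);

	/* Unlike STAILQ, any element can be unlinked directly. */
	TAILQ_REMOVE(&head, s, q);
	free(s);
	return (0);
}
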
diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c
index 0eadcbf3b01..240d0a5ffd3 100644
--- a/src/txn/txn_recover.c
+++ b/src/txn/txn_recover.c
@@ -65,7 +65,7 @@ __recovery_cursor(WT_SESSION_IMPL *session, WT_RECOVERY *r,
"No file found with ID %u (max %u)",
id, r->nfiles));
r->missing = 1;
- } else if (WT_LOG_CMP(lsnp, &r->files[id].ckpt_lsn) >= 0) {
+ } else if (__wt_log_cmp(lsnp, &r->files[id].ckpt_lsn) >= 0) {
/*
* We're going to apply the operation. Get the cursor, opening
* one if none is cached.
@@ -144,10 +144,10 @@ __txn_op_apply(
GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor);
/* Set up the cursors. */
- if (start_recno == 0) {
+ if (start_recno == WT_RECNO_OOB) {
start = NULL;
stop = cursor;
- } else if (stop_recno == 0) {
+ } else if (stop_recno == WT_RECNO_OOB) {
start = cursor;
stop = NULL;
} else {
@@ -522,7 +522,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session)
*/
WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));
-done:
+done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE);
err: WT_TRET(__recovery_free(&r));
__wt_free(session, config);
WT_TRET(session->iface.close(&session->iface, NULL));
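
Recovery now compares log positions through the function __wt_log_cmp rather than the old WT_LOG_CMP macro. Assuming an LSN is a (file number, offset) pair, as elsewhere in the logging code, the comparison is lexicographic; a hedged sketch, where the structure is illustrative rather than WiredTiger's WT_LSN:

#include <stdint.h>

struct my_lsn {
	uint32_t file;		/* Log file number */
	uint64_t offset;	/* Byte offset within the file */
};

/* Return -1, 0 or 1 ordering two log sequence numbers. */
static int
my_lsn_cmp(const struct my_lsn *a, const struct my_lsn *b)
{
	if (a->file != b->file)
		return (a->file < b->file ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}
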
diff --git a/src/utilities/util_list.c b/src/utilities/util_list.c
index 1888c7d967b..1d35f2efc72 100644
--- a/src/utilities/util_list.c
+++ b/src/utilities/util_list.c
@@ -97,12 +97,15 @@ list_print(WT_SESSION *session, const char *name, int cflag, int vflag)
}
/*
- * XXX
- * We don't normally say anything about the WiredTiger
- * metadata, it's not a normal "object" in the database. I'm
- * making an exception for the checkpoint and verbose options.
+ * !!!
+ * We don't normally say anything about the WiredTiger metadata
+ * and lookaside tables; they're not application/user "objects"
+ * in the database. I'm making an exception for the checkpoint
+ * and verbose options.
*/
- if (strcmp(key, WT_METADATA_URI) != 0 || cflag || vflag)
+ if (cflag || vflag ||
+ (strcmp(key, WT_METADATA_URI) != 0 &&
+ strcmp(key, WT_LAS_URI) != 0))
printf("%s\n", key);
if (!cflag && !vflag)
diff --git a/test/checkpoint/checkpointer.c b/test/checkpoint/checkpointer.c
index dd6fcd6b95a..c4f36ac69ba 100644
--- a/test/checkpoint/checkpointer.c
+++ b/test/checkpoint/checkpointer.c
@@ -134,8 +134,7 @@ done: if ((ret = session->close(session, NULL)) != 0)
/*
* verify_checkpoint --
* Open a cursor on each table at the last checkpoint and walk through
- * the tables in parallel. The key/values should match across all
- * tables.
+ * the tables in parallel. The key/values should match across all tables.
*/
static int
verify_checkpoint(WT_SESSION *session)
@@ -245,41 +244,36 @@ compare_cursors(
WT_CURSOR *cursor2, const char *type2)
{
uint64_t key1, key2;
- char *val1, *val2;
- char buf[128];
+ char *val1, *val2, buf[128];
+ int ret;
+ ret = 0;
memset(buf, 0, 128);
if (cursor1->get_key(cursor1, &key1) != 0 ||
cursor2->get_key(cursor2, &key2) != 0)
return (log_print_err("Error getting keys", EINVAL, 1));
- if (key1 != key2) {
- printf("Key mismatch %" PRIu64 " from a %s table "
- "is not %" PRIu64 " from a %s table\n",
- key1, type1, key2, type2);
-
- return (ERR_KEY_MISMATCH);
- }
-
- /* Now check the values. */
if (cursor1->get_value(cursor1, &val1) != 0 ||
cursor2->get_value(cursor2, &val2) != 0)
return (log_print_err("Error getting values", EINVAL, 1));
if (g.logfp != NULL)
fprintf(g.logfp, "k1: %" PRIu64 " k2: %" PRIu64
- " val1: %s val2: %s \n",
- key1, key2, val1, val2);
- if (strlen(val1) != strlen(val2) ||
- strcmp(val1, val2) != 0) {
- printf("Value mismatch for key %" PRIu64
- ", %s from a %s table is not %s from a %s table\n",
- key1, val1, type1, val2, type2);
- return (ERR_DATA_MISMATCH);
- }
+ " val1: %s val2: %s \n", key1, key2, val1, val2);
- return (0);
+ if (key1 != key2)
+ ret = ERR_KEY_MISMATCH;
+ else if (strlen(val1) != strlen(val2) || strcmp(val1, val2) != 0)
+ ret = ERR_DATA_MISMATCH;
+ else
+ return (0);
+
+ printf("Key/value mismatch: %" PRIu64 "/%s from a %s table is not %"
+ PRIu64 "/%s from a %s table\n",
+ key1, val1, type1, key2, val2, type2);
+
+ return (ret);
}
/*
@@ -349,10 +343,10 @@ diagnose_key_error(
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 1st key", ret, 0);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -361,10 +355,10 @@ diagnose_key_error(
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 1st key", ret, 0);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -378,7 +372,7 @@ live_check:
return (1);
c->set_key(c, key1_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("1st cursor didn't find 1st key\n", ret, 0);
+ (void)log_print_err("1st cursor didn't find 1st key", ret, 0);
if (c->close(c) != 0)
return (1);
@@ -387,7 +381,7 @@ live_check:
return (1);
c->set_key(c, key2_orig);
if ((ret = c->search(c)) != 0)
- (void)log_print_err("2nd cursor didn't find 2nd key\n", ret, 0);
+ (void)log_print_err("2nd cursor didn't find 2nd key", ret, 0);
if (c->close(c) != 0)
return (1);
diff --git a/test/checkpoint/workers.c b/test/checkpoint/workers.c
index 5cd2ef4e97b..b8ca5a37d2b 100644
--- a/test/checkpoint/workers.c
+++ b/test/checkpoint/workers.c
@@ -44,8 +44,7 @@ create_table(WT_SESSION *session, COOKIE *cookie)
p = config;
end = config + sizeof(config);
p += snprintf(p, (size_t)(end - p),
- "key_format=%s,value_format=S",
- cookie->type == COL ? "r" : "q");
+ "key_format=%s,value_format=S", cookie->type == COL ? "r" : "q");
if (cookie->type == LSM)
(void)snprintf(p, (size_t)(end - p), ",type=lsm");
@@ -133,8 +132,7 @@ worker_op(WT_CURSOR *cursor, uint64_t keyno, u_int new_val)
char valuebuf[64];
cursor->set_key(cursor, keyno);
- (void)snprintf(
- valuebuf, sizeof(valuebuf), "%037u", new_val);
+ (void)snprintf(valuebuf, sizeof(valuebuf), "%037u", new_val);
cursor->set_value(cursor, valuebuf);
if ((ret = cursor->insert(cursor)) != 0) {
if (ret == WT_ROLLBACK)
diff --git a/test/format/backup.c b/test/format/backup.c
index 3b95ea92b5e..5805012e1e0 100644
--- a/test/format/backup.c
+++ b/test/format/backup.c
@@ -65,8 +65,7 @@ copy_file(const char *name)
int ret;
len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20;
- if ((cmd = malloc(len)) == NULL)
- die(errno, "malloc");
+ cmd = dmalloc(len);
(void)snprintf(cmd, len,
"cp %s/%s %s/%s", g.home, name, g.home_backup, name);
if ((ret = system(cmd)) != 0)
diff --git a/test/format/bulk.c b/test/format/bulk.c
index 7cf4ba559dc..203043166a4 100644
--- a/test/format/bulk.c
+++ b/test/format/bulk.c
@@ -39,6 +39,7 @@ wts_load(void)
int is_bulk, ret;
conn = g.wts_conn;
+ keybuf = valbuf = NULL;
if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
die(ret, "connection.open_session");
diff --git a/test/format/config.c b/test/format/config.c
index 6e767a2c6a2..1f19ecf2cd2 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -36,6 +36,7 @@ static const char *config_file_type(u_int);
static CONFIG *config_find(const char *, size_t);
static int config_is_perm(const char *);
static void config_isolation(void);
+static void config_lrt(void);
static void config_map_checksum(const char *, u_int *);
static void config_map_compression(const char *, u_int *);
static void config_map_encryption(const char *, u_int *);
@@ -102,8 +103,7 @@ config_setup(void)
* our configuration, LSM or KVS devices are "tables", but files are
* tested as well.
*/
- if ((g.uri = malloc(256)) == NULL)
- die(errno, "malloc");
+ g.uri = dmalloc(256);
strcpy(g.uri, DATASOURCE("file") ? "file:" : "table:");
if (DATASOURCE("helium"))
strcat(g.uri, "dev1/");
@@ -135,12 +135,6 @@ config_setup(void)
if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
g.c_reverse = 0;
- config_checksum();
- config_compression("compression");
- config_compression("logging_compression");
- config_encryption();
- config_isolation();
-
/*
* Periodically, run single-threaded so we can compare the results to
* a Berkeley DB copy, as long as the thread-count isn't nailed down.
@@ -149,6 +143,13 @@ config_setup(void)
if (!g.replay && g.run_cnt % 20 == 19 && !config_is_perm("threads"))
g.c_threads = 1;
+ config_checksum();
+ config_compression("compression");
+ config_compression("logging_compression");
+ config_encryption();
+ config_isolation();
+ config_lrt();
+
/*
* Periodically, set the delete percentage to 0 so salvage gets run,
* as long as the delete percentage isn't nailed down.
@@ -329,6 +330,26 @@ config_isolation(void)
}
/*
+ * config_lrt --
+ * Long-running transaction configuration.
+ */
+static void
+config_lrt(void)
+{
+ /*
+ * The underlying engine doesn't support a lookaside file for
+ * fixed-length column stores.
+ */
+ if (g.type == FIX) {
+ if (config_is_perm("long_running_txn"))
+ die(EINVAL,
+ "long_running_txn not supported with fixed-length "
+ "column store");
+ g.c_long_running_txn = 0;
+ }
+}
+
+/*
* config_error --
* Display configuration information on error.
*/
diff --git a/test/format/format.h b/test/format/format.h
index 4ec2734aee9..d82dea5451f 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -310,6 +310,8 @@ void config_file(const char *);
void config_print(int);
void config_setup(void);
void config_single(const char *, int);
+void *dmalloc(size_t);
+char *dstrdup(const char *);
void fclose_and_clear(FILE **);
void key_gen(uint8_t *, size_t *, uint64_t);
void key_gen_insert(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
@@ -317,6 +319,7 @@ void key_gen_setup(uint8_t **);
void key_len_setup(void);
void *lrt(void *);
void path_setup(const char *);
+int read_row(WT_CURSOR *, WT_ITEM *, uint64_t, int);
uint32_t rng(WT_RAND_STATE *);
void track(const char *, uint64_t, TINFO *);
void val_gen(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
diff --git a/test/format/lrt.c b/test/format/lrt.c
index a00a4e07879..85b6e29f224 100644
--- a/test/format/lrt.c
+++ b/test/format/lrt.c
@@ -37,33 +37,120 @@ lrt(void *arg)
{
WT_CONNECTION *conn;
WT_CURSOR *cursor;
+ WT_ITEM key, value;
WT_SESSION *session;
+ size_t buf_len, buf_size;
+ uint64_t keyno, saved_keyno;
u_int period;
int pinned, ret;
+ uint8_t bitfield, *keybuf;
+ void *buf;
- (void)(arg);
+ (void)(arg); /* Unused parameter */
+
+ saved_keyno = 0; /* [-Werror=maybe-uninitialized] */
+
+ key_gen_setup(&keybuf);
+ memset(&key, 0, sizeof(key));
+ key.data = keybuf;
+ memset(&value, 0, sizeof(value));
+
+ buf = NULL;
+ buf_len = buf_size = 0;
/* Open a session and cursor. */
conn = g.wts_conn;
- if ((ret = conn->open_session(
- conn, NULL, "isolation=snapshot", &session)) != 0)
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
die(ret, "connection.open_session");
if ((ret = session->open_cursor(
session, g.uri, NULL, NULL, &cursor)) != 0)
die(ret, "session.open_cursor");
for (pinned = 0;;) {
- /*
- * If we have an open cursor, reset it, releasing our pin, else
- * position the cursor, creating a snapshot.
- */
if (pinned) {
+ /* Re-read the record at the end of the table. */
+ while ((ret = read_row(cursor,
+ &key, saved_keyno, 1)) == WT_ROLLBACK)
+ ;
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, saved_keyno);
+
+ /* Compare the previous value with the current one. */
+ if (g.type == FIX) {
+ ret = cursor->get_value(cursor, &bitfield);
+ value.data = &bitfield;
+ value.size = 1;
+ } else
+ ret = cursor->get_value(cursor, &value);
+ if (ret != 0)
+ die(ret,
+ "cursor.get_value: %" PRIu64, saved_keyno);
+
+ if (buf_size != value.size ||
+ memcmp(buf, value.data, value.size) != 0)
+ die(0, "mismatched start/stop values");
+
+ /* End the transaction. */
+ if ((ret =
+ session->commit_transaction(session, NULL)) != 0)
+ die(ret, "session.commit_transaction");
+
+ /* Reset the cursor, releasing our pin. */
if ((ret = cursor->reset(cursor)) != 0)
die(ret, "cursor.reset");
pinned = 0;
} else {
- if ((ret = cursor->next(cursor)) != 0)
- die(ret, "cursor.reset");
+ /*
+ * Begin transaction: without an explicit transaction,
+ * the snapshot is only kept around while a cursor is
+ * positioned. As soon as the cursor loses its position
+ * a new snapshot will be allocated.
+ */
+ if ((ret = session->begin_transaction(
+ session, "isolation=snapshot")) != 0)
+ die(ret, "session.begin_transaction");
+
+ /* Read a record at the end of the table. */
+ do {
+ saved_keyno = mmrand(NULL,
+ (u_int)(g.key_cnt - g.key_cnt / 10),
+ (u_int)g.key_cnt);
+ while ((ret = read_row(cursor,
+ &key, saved_keyno, 1)) == WT_ROLLBACK)
+ ;
+ } while (ret == WT_NOTFOUND);
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, saved_keyno);
+
+ /* Copy the cursor's value. */
+ if (g.type == FIX) {
+ ret = cursor->get_value(cursor, &bitfield);
+ value.data = &bitfield;
+ value.size = 1;
+ } else
+ ret = cursor->get_value(cursor, &value);
+ if (ret != 0)
+ die(ret,
+ "cursor.get_value: %" PRIu64, saved_keyno);
+ if (buf_len < value.size &&
+ (buf = realloc(buf, buf_len = value.size)) == NULL)
+ die(errno, "realloc");
+ memcpy(buf, value.data, buf_size = value.size);
+
+ /*
+ * Move the cursor to an early record in the table,
+ * hopefully allowing the page with the record just
+ * retrieved to be evicted from memory.
+ */
+ do {
+ keyno = mmrand(NULL, 1, (u_int)g.key_cnt / 5);
+ while ((ret = read_row(cursor,
+ &key, keyno, 1)) == WT_ROLLBACK)
+ ;
+ } while (ret == WT_NOTFOUND);
+ if (ret != 0)
+ die(ret, "read_row %" PRIu64, keyno);
+
pinned = 1;
}
@@ -82,5 +169,8 @@ lrt(void *arg)
if ((ret = session->close(session, NULL)) != 0)
die(ret, "session.close");
+ free(keybuf);
+ free(buf);
+
return (NULL);
}
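
The rewritten lrt() thread pins a snapshot with an explicit transaction, saves a value from the end of the table, forces reads elsewhere to encourage eviction, and then re-reads the saved key expecting an identical value. A condensed sketch of that read-stability check against the public WiredTiger API; it assumes a populated table at "uri" with string keys and values, and trims error handling so the shape of the test stays visible:

#include <stdio.h>
#include <string.h>
#include <wiredtiger.h>

static int
check_snapshot_stability(WT_CONNECTION *conn, const char *uri, const char *key)
{
	WT_CURSOR *cursor;
	WT_SESSION *session;
	const char *val;
	char saved[256];
	int ok;

	(void)conn->open_session(conn, NULL, NULL, &session);
	(void)session->open_cursor(session, uri, NULL, NULL, &cursor);

	(void)session->begin_transaction(session, "isolation=snapshot");
	cursor->set_key(cursor, key);
	(void)cursor->search(cursor);
	(void)cursor->get_value(cursor, &val);
	(void)snprintf(saved, sizeof(saved), "%s", val);

	/* ... other threads update the table in the meantime ... */

	cursor->set_key(cursor, key);
	(void)cursor->search(cursor);
	(void)cursor->get_value(cursor, &val);
	ok = strcmp(saved, val) == 0;	/* Must match under snapshot */

	(void)session->commit_transaction(session, NULL);
	(void)session->close(session, NULL);
	return (ok ? 0 : -1);
}
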
diff --git a/test/format/ops.c b/test/format/ops.c
index 7d3b22175ca..7c38aec4757 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -33,7 +33,6 @@ static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
static int col_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
static int nextprev(WT_CURSOR *, int, int *);
static void *ops(void *);
-static int read_row(WT_CURSOR *, WT_ITEM *, uint64_t);
static int row_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
static int row_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
@@ -64,6 +63,7 @@ wts_ops(int lastrun)
session = NULL; /* -Wconditional-uninitialized */
memset(&backup_tid, 0, sizeof(backup_tid));
memset(&compact_tid, 0, sizeof(compact_tid));
+ memset(&lrt_tid, 0, sizeof(lrt_tid));
/*
* There are two mechanisms to specify the length of the run, a number
@@ -239,13 +239,13 @@ ops(void *arg)
tinfo = arg;
- /* Initialize the per-thread random number generator. */
- __wt_random_init(&tinfo->rnd);
-
conn = g.wts_conn;
keybuf = valbuf = NULL;
readonly = 0; /* -Wconditional-uninitialized */
+ /* Initialize the per-thread random number generator. */
+ __wt_random_init(&tinfo->rnd);
+
/* Set up the default key and value buffers. */
key_gen_setup(&keybuf);
val_gen_setup(&tinfo->rnd, &valbuf);
@@ -475,7 +475,7 @@ skip_insert: if (col_update(tinfo,
}
} else {
++tinfo->search;
- if (read_row(cursor, &key, keyno))
+ if (read_row(cursor, &key, keyno, 0))
if (intxn)
goto deadlock;
continue;
@@ -498,7 +498,7 @@ skip_insert: if (col_update(tinfo,
/* Read to confirm the operation. */
++tinfo->search;
- if (read_row(cursor, &key, keyno))
+ if (read_row(cursor, &key, keyno, 0))
goto deadlock;
/* Reset the cursor: there is no reason to keep pages pinned. */
@@ -583,7 +583,7 @@ wts_read_scan(void)
}
key.data = keybuf;
- if ((ret = read_row(cursor, &key, cnt)) != 0)
+ if ((ret = read_row(cursor, &key, cnt, 0)) != 0)
die(ret, "read_scan");
}
@@ -597,8 +597,8 @@ wts_read_scan(void)
* read_row --
* Read and verify a single element in a row- or column-store file.
*/
-static int
-read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
+int
+read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
{
static int sn = 0;
WT_ITEM value;
@@ -634,19 +634,24 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
ret = cursor->search(cursor);
sn = 1;
}
- if (ret == 0) {
+ switch (ret) {
+ case 0:
if (g.type == FIX) {
ret = cursor->get_value(cursor, &bitfield);
value.data = &bitfield;
value.size = 1;
- } else {
+ } else
ret = cursor->get_value(cursor, &value);
- }
- }
- if (ret == WT_ROLLBACK)
+ break;
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ case WT_NOTFOUND:
+ if (notfound_err)
+ return (WT_NOTFOUND);
+ break;
+ default:
die(ret, "read_row: read row %" PRIu64, keyno);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
diff --git a/test/format/smoke.sh b/test/format/smoke.sh
index 8b4b5d9e424..5fbc349f242 100755
--- a/test/format/smoke.sh
+++ b/test/format/smoke.sh
@@ -3,7 +3,7 @@
set -e
# Smoke-test format as part of running "make check".
-args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none"
+args="-1 -c "." data_source=table ops=100000 rows=10000 threads=4 compression=none logging_compression=none"
$TEST_WRAPPER ./t $args file_type=fix
$TEST_WRAPPER ./t $args file_type=row
diff --git a/test/format/t.c b/test/format/t.c
index 8e8a627235f..603706e0ba1 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -40,7 +40,7 @@ int
main(int argc, char *argv[])
{
time_t start;
- int ch, reps, ret;
+ int ch, i, onerun, reps, ret;
const char *config, *home;
config = NULL;
@@ -64,11 +64,12 @@ main(int argc, char *argv[])
/* Set values from the command line. */
home = NULL;
+ onerun = 0;
while ((ch = __wt_getopt(
g.progname, argc, argv, "1C:c:H:h:Llqrt:")) != EOF)
switch (ch) {
case '1': /* One run */
- g.c_runs = 1;
+ onerun = 1;
break;
case 'C': /* wiredtiger_open config */
g.config_open = __wt_optarg;
@@ -105,8 +106,14 @@ main(int argc, char *argv[])
argc -= __wt_optind;
argv += __wt_optind;
- /* Initialize the global random number generator. */
+ /*
+ * Initialize the global RNG. Start with the standard seeds, and then
+ * use seconds since the Epoch modulo a prime to run the RNG for some
+ * number of steps, so we don't start with the same values every time.
+ */
__wt_random_init(&g.rnd);
+ for (i = (int)time(NULL) % 10007; i > 0; --i)
+ (void)__wt_random(&g.rnd);
/* Set up paths. */
path_setup(home);
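
The change above keeps the standard deterministic seed but advances the generator a time-dependent number of steps (seconds since the Epoch modulo the prime 10007), so runs differ without giving up a reproducible generator. The same idea with a self-contained toy generator, shown purely for illustration:

#include <stdint.h>
#include <time.h>

/* Any deterministic PRNG works; Marsaglia's xorshift64 shown. */
static uint64_t
xorshift64(uint64_t *state)
{
	uint64_t x = *state;

	x ^= x << 13;
	x ^= x >> 7;
	x ^= x << 17;
	return (*state = x);
}

int
main(void)
{
	uint64_t state = 88172645463325252ULL;	/* Fixed, documented seed */
	int i;

	/* Step a time-dependent distance so runs start in different places. */
	for (i = (int)(time(NULL) % 10007); i > 0; --i)
		(void)xorshift64(&state);
	return (0);
}
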
@@ -155,6 +162,13 @@ main(int argc, char *argv[])
g.c_runs = 1;
/*
+ * Let the command line -1 flag override runs configured from other
+ * sources.
+ */
+ if (onerun)
+ g.c_runs = 1;
+
+ /*
* Initialize locks to single-thread named checkpoints and backups,
* last-record updates, and failures.
*/
@@ -298,6 +312,11 @@ die(int e, const char *fmt, ...)
/* Single-thread error handling. */
(void)pthread_rwlock_wrlock(&g.death_lock);
+ /* Try and turn off tracking so it doesn't obscure the error message. */
+ if (g.track) {
+ g.track = 0;
+ fprintf(stderr, "\n");
+ }
if (fmt != NULL) { /* Death message. */
fprintf(stderr, "%s: ", g.progname);
va_start(ap, fmt);
diff --git a/test/format/util.c b/test/format/util.c
index 9d28b7a81bc..0f4f5de7c20 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -78,8 +78,7 @@ key_gen_setup(uint8_t **keyp)
*keyp = NULL;
len = MAX(KILOBYTE(100), g.c_key_max);
- if ((key = malloc(len)) == NULL)
- die(errno, "malloc");
+ key = dmalloc(len);
for (i = 0; i < len; ++i)
key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]);
*keyp = key;
@@ -139,8 +138,7 @@ val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp)
* data for column-store run-length encoded files.
*/
len = MAX(KILOBYTE(100), g.c_value_max) + 20;
- if ((val = malloc(len)) == NULL)
- die(errno, "malloc");
+ val = dmalloc(len);
for (i = 0; i < len; ++i)
val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]);
@@ -257,43 +255,36 @@ path_setup(const char *home)
size_t len;
/* Home directory. */
- if ((g.home = strdup(home == NULL ? "RUNDIR" : home)) == NULL)
- die(errno, "malloc");
+ g.home = dstrdup(home == NULL ? "RUNDIR" : home);
/* Log file. */
len = strlen(g.home) + strlen("log") + 2;
- if ((g.home_log = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_log = dmalloc(len);
snprintf(g.home_log, len, "%s/%s", g.home, "log");
/* RNG log file. */
len = strlen(g.home) + strlen("rand") + 2;
- if ((g.home_rand = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_rand = dmalloc(len);
snprintf(g.home_rand, len, "%s/%s", g.home, "rand");
/* Run file. */
len = strlen(g.home) + strlen("CONFIG") + 2;
- if ((g.home_config = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_config = dmalloc(len);
snprintf(g.home_config, len, "%s/%s", g.home, "CONFIG");
/* Statistics file. */
len = strlen(g.home) + strlen("stats") + 2;
- if ((g.home_stats = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_stats = dmalloc(len);
snprintf(g.home_stats, len, "%s/%s", g.home, "stats");
/* Backup directory. */
len = strlen(g.home) + strlen("BACKUP") + 2;
- if ((g.home_backup = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_backup = dmalloc(len);
snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP");
/* BDB directory. */
len = strlen(g.home) + strlen("bdb") + 2;
- if ((g.home_bdb = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_bdb = dmalloc(len);
snprintf(g.home_bdb, len, "%s/%s", g.home, "bdb");
/*
@@ -315,8 +306,7 @@ path_setup(const char *home)
"mkdir KVS"
#endif
len = strlen(g.home) * 3 + strlen(CMD) + 1;
- if ((g.home_init = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_init = dmalloc(len);
snprintf(g.home_init, len, CMD, g.home, g.home, g.home);
/* Backup directory initialize command, remove and re-create it. */
@@ -327,8 +317,7 @@ path_setup(const char *home)
#define CMD "rm -rf %s && mkdir %s"
#endif
len = strlen(g.home_backup) * 2 + strlen(CMD) + 1;
- if ((g.home_backup_init = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_backup_init = dmalloc(len);
snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup);
/*
@@ -351,8 +340,7 @@ path_setup(const char *home)
"cp WiredTiger* wt* slvg.copy/"
#endif
len = strlen(g.home) + strlen(CMD) + 1;
- if ((g.home_salvage_copy = malloc(len)) == NULL)
- die(errno, "malloc");
+ g.home_salvage_copy = dmalloc(len);
snprintf(g.home_salvage_copy, len, CMD, g.home);
}
@@ -422,3 +410,31 @@ fclose_and_clear(FILE **fpp)
die(errno, "fclose");
return;
}
+
+/*
+ * dmalloc --
+ * Call malloc, dying on failure.
+ */
+void *
+dmalloc(size_t len)
+{
+ void *p;
+
+ if ((p = malloc(len)) == NULL)
+ die(errno, "malloc");
+ return (p);
+}
+
+/*
+ * dstrdup --
+ * Call strdup, dying on failure.
+ */
+char *
+dstrdup(const char *str)
+{
+ char *p;
+
+ if ((p = strdup(str)) == NULL)
+ die(errno, "strdup");
+ return (p);
+}
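
dmalloc and dstrdup wrap their libc counterparts and die on failure, letting call sites drop their error checks. The lrt.c change above still checks realloc by hand; a companion helper in the same style might look like this (hypothetical, not part of this change; it relies on the test's existing die()):

/*
 * drealloc --
 *	Call realloc, dying on failure.
 */
void *
drealloc(void *p, size_t len)
{
	void *t;

	if ((t = realloc(p, len)) == NULL)
		die(errno, "realloc");
	return (t);
}
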
diff --git a/test/format/wts.c b/test/format/wts.c
index 3d3b59810e8..23823c20184 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -462,8 +462,7 @@ wts_dump(const char *tag, int dump_bdb)
track("dump files and compare", 0ULL, NULL);
len = strlen(g.home) + strlen(BERKELEY_DB_PATH) + strlen(g.uri) + 100;
- if ((cmd = malloc(len)) == NULL)
- die(errno, "malloc");
+ cmd = dmalloc(len);
(void)snprintf(cmd, len,
"sh s_dumpcmp -h %s %s %s %s %s %s",
g.home,
@@ -564,9 +563,7 @@ wts_stats(void)
/* Data source statistics. */
fprintf(fp, "\n\n====== Data source statistics:\n");
- if ((stat_name =
- malloc(strlen("statistics:") + strlen(g.uri) + 1)) == NULL)
- die(errno, "malloc");
+ stat_name = dmalloc(strlen("statistics:") + strlen(g.uri) + 1);
sprintf(stat_name, "statistics:%s", g.uri);
if ((ret = session->open_cursor(
session, stat_name, NULL, NULL, &cursor)) != 0)
diff --git a/test/suite/run.py b/test/suite/run.py
index 1cb7309cb53..5e7b76a79b9 100644
--- a/test/suite/run.py
+++ b/test/suite/run.py
@@ -312,7 +312,7 @@ if __name__ == '__main__':
else:
for arg in testargs:
testsFromArg(tests, loader, arg)
-
+
if debug:
import pdb
pdb.set_trace()
diff --git a/test/suite/test_async01.py b/test/suite/test_async01.py
index af5180192af..fee5e8232f1 100644
--- a/test/suite/test_async01.py
+++ b/test/suite/test_async01.py
@@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback):
def notify_error(self, key, value, optype, desc):
tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' +
str(optype) + '): ' + desc)
-
+
def notify(self, op, op_ret, flags):
# Note: we are careful not to throw any errors here. Any
diff --git a/test/suite/test_async02.py b/test/suite/test_async02.py
index 21d811989c8..c878e8dd114 100644
--- a/test/suite/test_async02.py
+++ b/test/suite/test_async02.py
@@ -51,7 +51,7 @@ class Callback(wiredtiger.AsyncCallback):
def notify_error(self, key, value, optype, exp, desc):
tty_pr('ERROR: notify(' + str(key) + ',' + str(value) + ',' +
str(optype) + '): ' + 'Expected: ' + str(exp) + ' ' + desc)
-
+
def notify(self, op, op_ret, flags):
# Note: we are careful not to throw any errors here. Any
diff --git a/test/suite/test_autoclose.py b/test/suite/test_autoclose.py
index 40106e6f97d..6dc71003a34 100644
--- a/test/suite/test_autoclose.py
+++ b/test/suite/test_autoclose.py
@@ -156,7 +156,7 @@ class test_autoclose(wttest.WiredTigerTestCase):
self.assertRaisesHavingMessage(exceptions.RuntimeError,
lambda: self.create_table(),
'/wt_session.* is None/')
-
+
def test_close_connection1(self):
"""
Use a connection handle after it is closed.
@@ -166,6 +166,6 @@ class test_autoclose(wttest.WiredTigerTestCase):
self.assertRaisesHavingMessage(exceptions.RuntimeError,
lambda: conn.open_session(None),
'/wt_connection.* is None/')
-
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_backup04.py b/test/suite/test_backup04.py
index 47e656cf9b1..a0a52f49817 100644
--- a/test/suite/test_backup04.py
+++ b/test/suite/test_backup04.py
@@ -83,7 +83,7 @@ class test_backup_target(wttest.WiredTigerTestCase, suite_subprocess):
# Compare the original and backed-up files using the wt dump command.
def compare(self, uri, dir_full, dir_incr):
- # print "Compare: full URI: " + uri + " with incremental URI "
+ # print "Compare: full URI: " + uri + " with incremental URI "
if dir_full == None:
full_name='original'
else:
diff --git a/test/suite/test_backup05.py b/test/suite/test_backup05.py
index 80706b20299..8ab329f761a 100644
--- a/test/suite/test_backup05.py
+++ b/test/suite/test_backup05.py
@@ -71,7 +71,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
session = self.setUpSessionOpen(conn)
session.verify(self.uri)
conn.close()
-
+
def test_backup(self):
'''Check manual fsyncLock backup strategy'''
diff --git a/test/suite/test_base05.py b/test/suite/test_base05.py
index 399cba07164..7d5ff59b2c9 100644
--- a/test/suite/test_base05.py
+++ b/test/suite/test_base05.py
@@ -154,7 +154,7 @@ class test_base05(wttest.WiredTigerTestCase):
choice = (n + i) % len(reflist)
result += reflist[choice]
return result + ':' + str(n)
-
+
def test_table_ss(self):
"""
Create entries, and read back in a cursor: key=string, value=string
@@ -196,7 +196,7 @@ class test_base05(wttest.WiredTigerTestCase):
def do_test_table_base(self, convert):
"""
- Base functionality that uses regular strings with
+ Base functionality that uses regular strings with
non-ASCII (UTF) chars and optionally converts them to
Unicode (considered a type separate from string in Python).
"""
diff --git a/test/suite/test_baseconfig.py b/test/suite/test_baseconfig.py
new file mode 100644
index 00000000000..6ac3654af11
--- /dev/null
+++ b/test/suite/test_baseconfig.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import wiredtiger, wttest
+
+# test_baseconfig
+# Test that the base configuration file can be ignored.
+class test_baseconfig(wttest.WiredTigerTestCase):
+ def test_baseconfig(self):
+ # Open up another database and modify the baseconfig
+ os.mkdir("A")
+ conn = wiredtiger.wiredtiger_open("A", 'create')
+ self.assertTrue(os.path.exists("A/WiredTiger.basecfg"))
+ with open("A/WiredTiger.basecfg", "a") as basecfg_file:
+ basecfg_file.write("foo!")
+ conn.close()
+
+ # Opening the database should now fail because the basecfg is invalid
+ self.assertRaisesWithMessage(
+ wiredtiger.WiredTigerError,
+ lambda: wiredtiger.wiredtiger_open("A", ''),
+ '/unknown configuration key/')
+
+ conn = wiredtiger.wiredtiger_open("A", "create,config_base=false")
+ conn.close()
+
+if __name__ == '__main__':
+ wttest.run()
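
The new test corrupts WiredTiger.basecfg, confirms a normal open now fails, and then reopens with config_base=false so the damaged file is ignored. The same recovery path through the C API, sketched below; the "A" home directory mirrors the test and error handling is minimal:

#include <stdio.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	int ret;

	/* A damaged WiredTiger.basecfg makes a normal open fail... */
	if ((ret = wiredtiger_open("A", NULL, "", &conn)) != 0)
		fprintf(stderr, "open: %s\n", wiredtiger_strerror(ret));

	/* ...but config_base=false ignores the base configuration file. */
	if ((ret = wiredtiger_open(
	    "A", NULL, "config_base=false", &conn)) == 0)
		(void)conn->close(conn, NULL);
	return (ret);
}
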
diff --git a/test/suite/test_bug005.py b/test/suite/test_bug005.py
index 961bb551b69..3e06bea8694 100644
--- a/test/suite/test_bug005.py
+++ b/test/suite/test_bug005.py
@@ -37,7 +37,7 @@ from helper import key_populate, value_populate
class test_bug005(wttest.WiredTigerTestCase):
# This is a btree layer test, test files, ignore tables.
uri = 'file:test_bug005'
-
+
def test_bug005(self):
# Create the object.
self.session.create(self.uri, 'value_format=S,key_format=S')
diff --git a/test/suite/test_bug008.py b/test/suite/test_bug008.py
index 0102cbd63f4..75cbd989cd1 100644
--- a/test/suite/test_bug008.py
+++ b/test/suite/test_bug008.py
@@ -48,7 +48,7 @@ class test_bug008(wttest.WiredTigerTestCase):
# Populate the tree and reopen the connection, forcing it to disk
# and moving the records to an on-page format.
- simple_populate(self, uri, self.fmt, 100)
+ simple_populate(self, uri, self.fmt, 100)
self.reopen_conn()
# Begin a transaction, and add some additional records.
@@ -105,7 +105,7 @@ class test_bug008(wttest.WiredTigerTestCase):
# Populate the tree and reopen the connection, forcing it to disk
# and moving the records to an on-page format.
- simple_populate(self, uri, self.fmt, 100)
+ simple_populate(self, uri, self.fmt, 100)
self.reopen_conn()
# Add some additional visible records.
diff --git a/test/suite/test_bug011.py b/test/suite/test_bug011.py
index d2c56adb221..b93fc3a75b7 100644
--- a/test/suite/test_bug011.py
+++ b/test/suite/test_bug011.py
@@ -64,7 +64,7 @@ class test_bug011(wttest.WiredTigerTestCase):
# Make sure we have a cursor for the table so it stays in cache.
for i in range(0, self.ntables):
this_uri = 'table:%s-%03d' % (self.table_name, i)
- cursors.append(self.session.open_cursor(this_uri, None))
+ cursors.append(self.session.open_cursor(this_uri, None))
# Make use of the cache.
for i in range(0, self.nops):
diff --git a/test/suite/test_checkpoint01.py b/test/suite/test_checkpoint01.py
index aacc8f1f055..799e6ded1ea 100644
--- a/test/suite/test_checkpoint01.py
+++ b/test/suite/test_checkpoint01.py
@@ -70,7 +70,7 @@ class test_checkpoint(wttest.WiredTigerTestCase):
for checkpoint_name, entry in self.checkpoints.iteritems():
self.add_records(checkpoint_name)
self.session.checkpoint("name=" + checkpoint_name)
-
+
# Create a dictionary of sorted records a checkpoint should include.
def list_expected(self, name):
records = {}
diff --git a/test/suite/test_cursor01.py b/test/suite/test_cursor01.py
index 507036e85cf..47cc7f6c5b7 100644
--- a/test/suite/test_cursor01.py
+++ b/test/suite/test_cursor01.py
@@ -167,7 +167,7 @@ class test_cursor01(wttest.WiredTigerTestCase):
def backward_iter(self, cursor):
cursor.reset()
self.assertCursorHasNoKeyValue(cursor)
-
+
i = self.nentries - 1
while True:
prevret = cursor.prev()
@@ -188,7 +188,7 @@ class test_cursor01(wttest.WiredTigerTestCase):
def backward_iter_with_dup(self, cursor):
cursor.reset()
self.assertCursorHasNoKeyValue(cursor)
-
+
i = self.nentries - 1
while True:
prevret = cursor.prev()
diff --git a/test/suite/test_cursor04.py b/test/suite/test_cursor04.py
index 50cde0023d8..08f1a7240a5 100644
--- a/test/suite/test_cursor04.py
+++ b/test/suite/test_cursor04.py
@@ -113,7 +113,7 @@ class test_cursor04(wttest.WiredTigerTestCase):
self.assertEqual(direction, 0)
self.assertEqual(cursor.get_key(), origkey)
self.assertEqual(cursor.get_value(), 0)
-
+
def test_searches(self):
"""
Create entries, and read back in a cursor: key=string, value=string
@@ -174,7 +174,7 @@ class test_cursor04(wttest.WiredTigerTestCase):
self.assertEqual(cmp, 0)
self.assertEqual(cursor.get_key(), self.genkey(0))
self.assertEqual(cursor.get_value(), 0)
-
+
cursor.set_key(self.genkey(5))
self.expect_either(cursor, 4, 6)
diff --git a/test/suite/test_cursor06.py b/test/suite/test_cursor06.py
index 28ac581cf66..c11d043a548 100644
--- a/test/suite/test_cursor06.py
+++ b/test/suite/test_cursor06.py
@@ -58,7 +58,7 @@ class test_cursor06(wttest.WiredTigerTestCase):
cursor.set_value(v[0], v[1], v[2], v[3])
else:
cursor.set_value(value_populate(cursor, 10))
-
+
def test_reconfigure_overwrite(self):
uri = self.type + self.name
for open_config in (None, "overwrite=0", "overwrite=1"):
@@ -77,7 +77,7 @@ class test_cursor06(wttest.WiredTigerTestCase):
self.set_kv(cursor)
cursor.insert()
cursor.close()
-
+
def test_reconfigure_readonly(self):
uri = self.type + self.name
for open_config in (None, "readonly=0", "readonly=1"):
diff --git a/test/suite/test_cursor_random.py b/test/suite/test_cursor_random.py
index be08c59210f..10a3140a2fd 100644
--- a/test/suite/test_cursor_random.py
+++ b/test/suite/test_cursor_random.py
@@ -92,7 +92,7 @@ class test_cursor_random(wttest.WiredTigerTestCase):
# Check that next_random works in the presence of a larger set of values,
# where the values are in a disk format page.
- def test_cursor_random_multiple_page_records(self):
+ def cursor_random_multiple_page_records(self, reopen):
uri = self.type + 'random'
if self.type == 'file:':
simple_populate(self, uri,
@@ -103,10 +103,10 @@ class test_cursor_random(wttest.WiredTigerTestCase):
'allocation_size=512,leaf_page_max=512,key_format=' +\
self.fmt, 10000)
- # Close the connection so everything is forced to disk (otherwise the
- # values are on an insert list and the underlying engine doesn't make
- # random selections, it selects the middle of the list.
- self.reopen_conn()
+ # Optionally close the connection so everything is forced to disk,
+ # insert lists are an entirely different path in the code.
+ if reopen:
+ self.reopen_conn()
cursor = self.session.open_cursor(uri, None, "next_random=true")
last = ''
@@ -120,6 +120,10 @@ class test_cursor_random(wttest.WiredTigerTestCase):
self.assertLess(match, 5,
'next_random did not return random records, too many matches found')
+ def test_cursor_random_multiple_page_records_reopen(self):
+ self.cursor_random_multiple_page_records(1)
+ def test_cursor_random_multiple_page_records(self):
+ self.cursor_random_multiple_page_records(0)
# Check that opening a random cursor on column-store returns not-supported.
class test_cursor_random_column(wttest.WiredTigerTestCase):
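
next_random cursors hand back randomly chosen records, and the refactoring above now exercises them both against in-memory insert lists and against on-disk pages. Opening one through the C API looks roughly like this; the URI is an example:

#include <wiredtiger.h>

static int
read_random_record(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "file:random", NULL, "next_random=true", &cursor)) != 0)
		return (ret);
	ret = cursor->next(cursor);	/* Positions on a random record */
	(void)cursor->close(cursor);
	return (ret);
}
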
diff --git a/test/suite/test_cursor_tracker.py b/test/suite/test_cursor_tracker.py
index 1fa93f3e59b..742dea4c32b 100644
--- a/test/suite/test_cursor_tracker.py
+++ b/test/suite/test_cursor_tracker.py
@@ -461,7 +461,7 @@ class TestCursorTracker(wttest.WiredTigerTestCase):
except:
v = '[invalid]'
print(prefix + k + ' ' + v)
-
+
def cur_check(self, cursor, got, want, iskey):
if got != want:
if iskey:
diff --git a/test/suite/test_durability01.py b/test/suite/test_durability01.py
index 716e38c17d4..8d00d05fa14 100644
--- a/test/suite/test_durability01.py
+++ b/test/suite/test_durability01.py
@@ -52,7 +52,7 @@ class test_durability01(wttest.WiredTigerTestCase, suite_subprocess):
session = self.setUpSessionOpen(conn)
session.verify(self.uri)
conn.close()
-
+
def test_durability(self):
'''Check for missing metadata checkpoints'''
diff --git a/test/suite/test_encrypt03.py b/test/suite/test_encrypt03.py
index 0e06d4491ca..0e19ad39263 100644
--- a/test/suite/test_encrypt03.py
+++ b/test/suite/test_encrypt03.py
@@ -86,7 +86,7 @@ class test_encrypt03(wttest.WiredTigerTestCase):
def test_encrypt(self):
params = 'key_format=S,value_format=S,encryption=(name='
if self.file_encrypt != None:
- params += self.file_encrypt
+ params += self.file_encrypt
if self.file_encrypt_args != None:
params += ',keyid=' + self.file_encrypt_args
params += ')'
diff --git a/test/suite/test_encrypt04.py b/test/suite/test_encrypt04.py
index ea9bcc5aacb..41fd0f6dd48 100644
--- a/test/suite/test_encrypt04.py
+++ b/test/suite/test_encrypt04.py
@@ -46,9 +46,15 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
# with simply the wrong keyid may appear valid when initially verified,
# but may result in error on first use. The odds that a real encryptor
# would leave a lot of its input unchanged are infinitesimally small.
+ #
+ # When both self.forceerror1 and self.forceerror2 occur, we set a config
+ # flag when loading the rotn encryptor, which forces a particular error
+ # return in rotn.decrypt. We look for that return back from
+ # wiredtiger_open.
encrypt_scen_1 = [
('none', dict( name1='none', keyid1='', secretkey1='')),
- ('rotn17abc', dict( name1='rotn', keyid1='17', secretkey1='ABC')),
+ ('rotn17abc', dict( name1='rotn', keyid1='17',
+ secretkey1='ABC', forceerror1=True)),
('rotn11abc', dict( name1='rotn', keyid1='11', secretkey1='ABC')),
('rotn11xyz', dict( name1='rotn', keyid1='11', secretkey1='XYZ')),
('rotn11xyz_and_clear', dict( name1='rotn', keyid1='11',
@@ -58,7 +64,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
('none', dict( name2='none', keyid2='', secretkey2='')),
('rotn17abc', dict( name2='rotn', keyid2='17', secretkey2='ABC')),
('rotn11abc', dict( name2='rotn', keyid2='11', secretkey2='ABC')),
- ('rotn11xyz', dict( name2='rotn', keyid2='11', secretkey2='XYZ')),
+ ('rotn11xyz', dict( name2='rotn', keyid2='11',
+ secretkey2='XYZ', forceerror2=True)),
('rotn11xyz_and_clear', dict( name2='rotn', keyid2='11',
secretkey2='XYZ', fileinclear2=True))
]
@@ -73,6 +80,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
# Override WiredTigerTestCase, we have extensions.
def setUpConnectionOpen(self, dir):
+ forceerror = None
if self.part == 1:
self.name = self.name1
self.keyid = self.keyid1
@@ -85,15 +93,28 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
self.secretkey = self.secretkey2
self.fileinclear = self.fileinclear2 if \
hasattr(self, 'fileinclear2') else False
+ if hasattr(self, 'forceerror1') and hasattr(self, 'forceerror2'):
+ forceerror = "rotn_force_error=true"
+ self.expect_forceerror = forceerror != None
+ self.got_forceerror = False
encarg = 'encryption=(name={0},keyid={1},secretkey={2}),'.format(
self.name, self.keyid, self.secretkey)
- extarg = self.extensionArg([('encryptors', self.name),
- ('encryptors', self.name)])
+ # If forceerror is set for this test, add a config arg to
+ # the extension string. That signals rotn to return a (-1000)
+ # error code, which we'll detect here.
+ extarg = self.extensionArg([('encryptors', self.name, forceerror)])
self.pr('encarg = ' + encarg + ' extarg = ' + extarg)
- conn = wiredtiger.wiredtiger_open(dir,
- 'create,error_prefix="{0}: ",{1}{2}'.format(
- self.shortid(), encarg, extarg))
+ completed = False
+ try:
+ conn = wiredtiger.wiredtiger_open(dir,
+ 'create,error_prefix="{0}: ",{1}{2}'.format(
+ self.shortid(), encarg, extarg))
+ except (BaseException) as err:
+ # Capture the recognizable error created by rotn
+ if str(-1000) in str(err):
+ self.got_forceerror = True
+ raise
self.pr(`conn`)
return conn
@@ -119,7 +140,7 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
def extensionArg(self, exts):
extfiles = []
for ext in exts:
- (dirname, name) = ext
+ (dirname, name, extarg) = ext
if name != None and name != 'none':
testdir = os.path.dirname(__file__)
extdir = os.path.join(run.wt_builddir, 'ext', dirname)
@@ -127,12 +148,16 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
extdir, name, '.libs', 'libwiredtiger_' + name + '.so')
if not os.path.exists(extfile):
self.skipTest('extension "' + extfile + '" not built')
+ extfile = '"' + extfile + '"'
if not extfile in extfiles:
- extfiles.append(extfile)
+ s = extfile
+ if extarg != None:
+ s += "=(config=\"" + extarg + "\")"
+ extfiles.append(s)
if len(extfiles) == 0:
return ''
else:
- return ',extensions=["' + '","'.join(extfiles) + '"]'
+ return ',extensions=[' + ','.join(extfiles) + ']'
# Evaluate expression, which either must succeed (if expect_okay)
# or must fail (if !expect_okay).
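
The reworked extensionArg emits an extensions list in which an individual entry can carry its own configuration, which is how the test passes rotn_force_error=true into the rotn encryptor. The equivalent wiredtiger_open configuration from C would be roughly as follows; the shared-library path is an example:

#include <wiredtiger.h>

/* Load the rotn extension with per-extension configuration. */
int
open_with_rotn(WT_CONNECTION **connp)
{
	return (wiredtiger_open(".", NULL,
	    "create,"
	    "extensions=[\"ext/encryptors/rotn/.libs/libwiredtiger_rotn.so\""
	    "=(config=\"rotn_force_error=true\")]",
	    connp));
}
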
@@ -204,7 +229,8 @@ class test_encrypt04(wttest.WiredTigerTestCase, suite_subprocess):
self.check_records(cursor, r, 0, self.nrecords)
self.check_records(cursor, r, self.nrecords, self.nrecords * 2)
cursor.close()
-
+ self.assertEqual(self.expect_forceerror, self.got_forceerror)
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_encrypt05.py b/test/suite/test_encrypt05.py
index f5db543ecf3..8a69e5f909f 100644
--- a/test/suite/test_encrypt05.py
+++ b/test/suite/test_encrypt05.py
@@ -93,7 +93,7 @@ class test_encrypt05(wttest.WiredTigerTestCase):
diff = n - len(self.bigvalue)
rchr = ''.join(chr(r.randint(1, 255)) for i in range(diff))
return self.bigvalue + rchr
-
+
# Create a table, add key/values with specific lengths, then verify them.
def test_encrypt(self):
params = 'key_format=S,value_format=S'
diff --git a/test/suite/test_encrypt06.py b/test/suite/test_encrypt06.py
index 21e4d50769c..5b2007fe6e7 100644
--- a/test/suite/test_encrypt06.py
+++ b/test/suite/test_encrypt06.py
@@ -211,7 +211,7 @@ class test_encrypt06(wttest.WiredTigerTestCase):
c0.close()
c1.close()
-
+
# Force everything to disk so we can examine it
self.close_conn()
@@ -222,7 +222,7 @@ class test_encrypt06(wttest.WiredTigerTestCase):
not self.match_string_in_rundir(txt0))
self.assertEqual(self.expected_encryption(self.encrypt1),
not self.match_string_in_rundir(txt1))
-
+
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py
index 0c6b38db3ef..790f651fd2f 100644
--- a/test/suite/test_jsondump02.py
+++ b/test/suite/test_jsondump02.py
@@ -84,7 +84,7 @@ class test_jsondump02(wttest.WiredTigerTestCase):
cursor[insert[0]] = insert[1]
finally:
cursor.close()
-
+
# Create JSON cursors and test them directly.
def test_json_cursor(self):
"""
@@ -140,50 +140,50 @@ class test_jsondump02(wttest.WiredTigerTestCase):
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('<>abc?', '9'),)),
'/unknown token/')
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"abc\u"', ''),)),
'/invalid Unicode/')
# bad tokens
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"abc', ''),)),
'/unterminated string/')
# bad syntax
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"stuff" "jibberish"', '"value0" "more jibberish"'),)),
'/expected key name.*\"key0\"/')
# bad types
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : "xyz",\n"value1" : "str0"'),)),
'/expected unsigned JSON <int>, got <string>/')
# bad types
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 123,\n"value1" : 456'),)),
'/expected JSON <string>, got <integer>/')
# extra stuff
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"',
'"value0" : 123,\n"value1" : "str0",'),)),
'/expected JSON <EOF>, got \',\'/')
# fields out of order currently not supported
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value1" : "str0",\n"value0" : 123'),)),
'/expected value name.*\"value0\"/')
@@ -192,17 +192,17 @@ class test_jsondump02(wttest.WiredTigerTestCase):
'\\u', '\\ux', '\\u0', '\\u0F', '\\u0FA', '\\u0FAx', '\\u0FA\\x')
for uni in invalid_unicode:
self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.load_json(self.table_uri2,
+ lambda: self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 123,\n"value1" : "'
+ uni + '"'),)),
'/invalid Unicode/')
# this one should work
- self.load_json(self.table_uri2,
+ self.load_json(self.table_uri2,
(('"key0" : "KEY002"', '"value0" : 345,\n"value1" : "str2"'),))
# extraneous/missing space is okay
- self.load_json(self.table_uri2,
+ self.load_json(self.table_uri2,
((' "key0"\n:\t"KEY003" ',
'"value0":456,"value1"\n\n\r\n:\t\n"str3"'),))
diff --git a/test/suite/test_metadata_cursor01.py b/test/suite/test_metadata_cursor01.py
index 35fd1a74354..706b8a4132a 100644
--- a/test/suite/test_metadata_cursor01.py
+++ b/test/suite/test_metadata_cursor01.py
@@ -107,7 +107,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase):
self.create_table()
cursor = self.session.open_cursor(self.metauri, None, None)
self.assertCursorHasNoKeyValue(cursor)
-
+
while True:
prevret = cursor.prev()
if prevret != 0:
@@ -124,7 +124,7 @@ class test_metadata_cursor01(wttest.WiredTigerTestCase):
self.create_table()
cursor = self.session.open_cursor(self.metauri, None, None)
self.assertCursorHasNoKeyValue(cursor)
-
+
# Ensure the 'special' entry for the metadata itself is found.
value = cursor['metadata:']
self.assertTrue(value.find('key_format') != -1)
diff --git a/test/suite/test_pack.py b/test/suite/test_pack.py
index c9d360c2dcd..451c6fbb9a9 100644
--- a/test/suite/test_pack.py
+++ b/test/suite/test_pack.py
@@ -43,7 +43,7 @@ class test_pack(wttest.WiredTigerTestCase):
y = cursor.get_value()
self.tty(' ' + name + ': ' + str(x) + ' => ' + str(y))
cursor.reset()
-
+
def check(self, fmt, *v):
v = list(v)
fmtname = re.sub('([A-Z])', r'_\1', fmt)
diff --git a/test/suite/test_priv01.py b/test/suite/test_priv01.py
index 9b6b494e76e..0602d24a2b2 100644
--- a/test/suite/test_priv01.py
+++ b/test/suite/test_priv01.py
@@ -131,7 +131,7 @@ class test_priv01(wttest.WiredTigerTestCase):
lambda: self.common_test(None, edir, None),
'/WIREDTIGER_HOME environment variable set but\
process lacks privileges to use that environment variable/')
-
+
def test_env_conf_priv(self):
edir = 'envdir'
os.mkdir(edir)
diff --git a/test/suite/test_schema02.py b/test/suite/test_schema02.py
index ab709a28211..0cbff4b5ae0 100644
--- a/test/suite/test_schema02.py
+++ b/test/suite/test_schema02.py
@@ -173,7 +173,7 @@ class test_schema02(wttest.WiredTigerTestCase):
cursor[(i, 'key' + str(i))] = \
('val' + str(square), square, 'val' + str(cube), cube)
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:main', None, None)
# spot check via search
diff --git a/test/suite/test_schema04.py b/test/suite/test_schema04.py
index 9ad01b0f285..a66e1ea2411 100644
--- a/test/suite/test_schema04.py
+++ b/test/suite/test_schema04.py
@@ -79,7 +79,7 @@ class test_schema04(wttest.WiredTigerTestCase):
(i*3)%100, (i*4)%100, (i*5)%100)
cursor.insert()
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:schema04', None, None)
icursor = []
diff --git a/test/suite/test_schema05.py b/test/suite/test_schema05.py
index c3919af0880..2a7bc042c80 100644
--- a/test/suite/test_schema05.py
+++ b/test/suite/test_schema05.py
@@ -120,7 +120,7 @@ class test_schema05(wttest.WiredTigerTestCase):
cursor[i] = ','.join([str((i*j)%100) for j in
range(0, self.nindices)])
cursor.close()
-
+
def check_entries(self):
cursor = self.session.open_cursor('table:schema05', None, None)
icursor = []
diff --git a/test/suite/test_sweep01.py b/test/suite/test_sweep01.py
index f5e2aa96cbe..13422a75a61 100644
--- a/test/suite/test_sweep01.py
+++ b/test/suite/test_sweep01.py
@@ -42,7 +42,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
uri = 'table:' + tablebase
numfiles = 50
numkv = 1000
- ckpt = 5
types = [
('row', dict(tabletype='row',
@@ -65,7 +64,6 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
',create,error_prefix="%s: ",' % self.shortid() + \
'file_manager=(close_handle_minimum=0,' + \
'close_idle_time=6,close_scan_interval=2),' + \
- 'checkpoint=(wait=%d),' % self.ckpt + \
'statistics=(fast),'
# print "Creating conn at '%s' with config '%s'" % (dir, conn_params)
try:
@@ -93,12 +91,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
time.sleep(1)
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ remove1 = stat_cursor[stat.conn.dh_sweep_remove][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
sclose1 = stat_cursor[stat.conn.dh_session_handles][2]
ssweep1 = stat_cursor[stat.conn.dh_session_sweeps][2]
- tod1 = stat_cursor[stat.conn.dh_conn_tod][2]
- ref1 = stat_cursor[stat.conn.dh_conn_ref][2]
+ tod1 = stat_cursor[stat.conn.dh_sweep_tod][2]
+ ref1 = stat_cursor[stat.conn.dh_sweep_ref][2]
nfile1 = stat_cursor[stat.conn.file_open][2]
stat_cursor.close()
@@ -116,10 +115,15 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
# checkpoint something to do. Make sure checkpoint doesn't adjust
# the time of death for inactive handles.
#
+ # Note that we do checkpoints inline because that has the side effect
+ # of sweeping the session cache, which will allow handles to be
+ # removed.
+ #
c = self.session.open_cursor(uri, None)
k = 0
sleep = 0
while sleep < 12:
+ self.session.checkpoint()
k = k+1
c[k] = 1
sleep += 2
@@ -127,13 +131,14 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
c.close()
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ remove2 = stat_cursor[stat.conn.dh_sweep_remove][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
sclose2 = stat_cursor[stat.conn.dh_session_handles][2]
ssweep2 = stat_cursor[stat.conn.dh_session_sweeps][2]
nfile2 = stat_cursor[stat.conn.file_open][2]
- tod2 = stat_cursor[stat.conn.dh_conn_tod][2]
- ref2 = stat_cursor[stat.conn.dh_conn_ref][2]
+ tod2 = stat_cursor[stat.conn.dh_sweep_tod][2]
+ ref2 = stat_cursor[stat.conn.dh_sweep_ref][2]
stat_cursor.close()
# print "checkpoint: " + str(self.ckpt)
# print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
@@ -144,12 +149,13 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
# print "tod1: " + str(tod1) + " tod2: " + str(tod2)
# print "ref1: " + str(ref1) + " ref2: " + str(ref2)
- #
+ #
# The files are all closed. Check that sweep did its work even
# in the presence of recent checkpoints.
#
if (close1 >= close2):
print "XX: close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -157,8 +163,19 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
self.assertEqual(close1 < close2, True)
+ if (remove1 >= remove2):
+ print "close1: " + str(close1) + " close2: " + str(close2)
+ print "XX: remove1: " + str(remove1) + " remove2: " + str(remove2)
+ print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
+ print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
+ print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
+ print "tod1: " + str(tod1) + " tod2: " + str(tod2)
+ print "ref1: " + str(ref1) + " ref2: " + str(ref2)
+ print "nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
+ self.assertEqual(remove1 < remove2, True)
if (sweep1 >= sweep2):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "XX: sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -167,6 +184,7 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
self.assertEqual(sweep1 < sweep2, True)
if (nfile2 >= nfile1):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
@@ -174,17 +192,18 @@ class test_sweep01(wttest.WiredTigerTestCase, suite_subprocess):
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "XX: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
self.assertEqual(nfile2 < nfile1, True)
- # The only files that should be left is the metadata, the lock file
- # and the active file.
- if (nfile2 != 3):
+ # The only files that should be left are the metadata, the lookaside
+ # file, the lock file, and the active file.
+ if (nfile2 != 4):
print "close1: " + str(close1) + " close2: " + str(close2)
+ print "remove1: " + str(remove1) + " remove2: " + str(remove2)
print "sweep1: " + str(sweep1) + " sweep2: " + str(sweep2)
print "sclose1: " + str(sclose1) + " sclose2: " + str(sclose2)
print "ssweep1: " + str(ssweep1) + " ssweep2: " + str(ssweep2)
print "tod1: " + str(tod1) + " tod2: " + str(tod2)
print "ref1: " + str(ref1) + " ref2: " + str(ref2)
print "XX2: nfile1: " + str(nfile1) + " nfile2: " + str(nfile2)
- self.assertEqual(nfile2 == 3, True)
+ self.assertEqual(nfile2 == 4, True)
if __name__ == '__main__':
wttest.run()
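
The hunks above track two behavior changes: the connection data-handle sweep statistics were renamed (dh_conn_* becomes dh_sweep_*, with a new dh_sweep_remove counter for handles removed by sweep), and the expected residual file count rises from 3 to 4 because the lookaside file now stays open alongside the metadata, lock, and active files. A minimal sketch of reading the renamed counters outside the test harness, assuming the standalone wiredtiger Python API:

import os
import wiredtiger
from wiredtiger import stat

# Assumes an empty WT_HOME directory; statistics=(fast) enables the
# counters the sweep tests read.
if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create,statistics=(fast)')
session = conn.open_session()

stat_cursor = session.open_cursor('statistics:', None, None)
closed = stat_cursor[stat.conn.dh_sweep_close][2]    # handles closed by sweep
removed = stat_cursor[stat.conn.dh_sweep_remove][2]  # handles removed by sweep
sweeps = stat_cursor[stat.conn.dh_sweeps][2]         # sweep server passes
stat_cursor.close()
conn.close()
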
diff --git a/test/suite/test_sweep03.py b/test/suite/test_sweep03.py
index 4030e2fb715..684c87695c5 100644
--- a/test/suite/test_sweep03.py
+++ b/test/suite/test_sweep03.py
@@ -93,13 +93,13 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
time.sleep(5)
stat_cursor = self.session.open_cursor('statistics:', None, None)
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# The sweep server should have run, or the test isn't working.
self.assertGreater(sweep1, 0)
- # We expect nothing to have been closed, so dh_conn_handles should be 0
+ # We expect nothing to have been closed.
self.assertEqual(close1, 0)
def test_disable_idle_timeout_drop_force(self):
@@ -116,7 +116,7 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# We just filled the table, now check what the stats are
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# We force the drop in this case to confirm that the handle is closed
@@ -127,8 +127,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
# Make sure the sweep server is still working.
@@ -151,8 +151,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# We just filled the table, now check what the stats are
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache1 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close1 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep1 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close1 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep1 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
self.session.drop(drop_uri, None)
@@ -162,8 +162,8 @@ class test_sweep03(wttest.WiredTigerTestCase, suite_subprocess):
# Grab the stats post table drop to see things have decremented
stat_cursor = self.session.open_cursor('statistics:', None, None)
cache2 = stat_cursor[stat.conn.cache_bytes_inuse][2]
- close2 = stat_cursor[stat.conn.dh_conn_handles][2]
- sweep2 = stat_cursor[stat.conn.dh_conn_sweeps][2]
+ close2 = stat_cursor[stat.conn.dh_sweep_close][2]
+ sweep2 = stat_cursor[stat.conn.dh_sweeps][2]
stat_cursor.close()
self.assertGreater(sweep2, sweep1)
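
Both sweep tests rely on an aggressive file_manager configuration so that idle data handles are closed and swept within the tests' short sleep windows. A hedged sketch of opening a connection with the same settings test_sweep01 builds into conn_params (the timing values are the test's choices, not requirements):

import os
import wiredtiger

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
# close_handle_minimum=0 lets sweep consider every handle; idle handles
# are closed after 6 seconds, with a scan every 2 seconds.
conn = wiredtiger.wiredtiger_open('WT_HOME',
    'create,statistics=(fast),'
    'file_manager=(close_handle_minimum=0,'
    'close_idle_time=6,close_scan_interval=2)')
conn.close()
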
diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py
index 5827a892654..83c10f41244 100644
--- a/test/suite/test_txn02.py
+++ b/test/suite/test_txn02.py
@@ -217,7 +217,7 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess):
for i, ot in enumerate(zip(ops, txns)):
ok, txn = ot
op, k = ok
-
+
# Close and reopen the connection and cursor.
if reopen == 'reopen':
self.reopen_conn()
diff --git a/test/suite/test_txn03.py b/test/suite/test_txn03.py
index 41e283a8050..e2efef1742e 100644
--- a/test/suite/test_txn03.py
+++ b/test/suite/test_txn03.py
@@ -39,7 +39,7 @@ class test_txn03(wttest.WiredTigerTestCase):
uri2 = 'table:' + tablename + "_2"
key_str = "TEST_KEY1"
data_str1 = "VAL"
- data_str2 = "TEST_VAL1"
+ data_str2 = "TEST_VAL1"
nentries = 1000
scenarios = check_scenarios([
diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py
index d0a21f5ec9c..f9f660223da 100644
--- a/test/suite/test_txn04.py
+++ b/test/suite/test_txn04.py
@@ -72,7 +72,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.txn_sync = self.sync_list[
self.scenario_number % len(self.sync_list)]
self.backup_dir = os.path.join(self.home, "WT_BACKUP")
- # Set archive false on the home directory.
+ # Set archive false on the home directory.
conn_params = \
'log=(archive=false,enabled,file_max=%s),' % self.logmax + \
'create,error_prefix="%s: ",' % self.shortid() + \
@@ -158,7 +158,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.session.begin_transaction()
ok, txn = ot
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'insert' or op == 'update':
c[k] = i + 2
@@ -199,7 +199,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.hot_backup(self.uri, committed)
if txn == 'commit':
self.assertEqual(True, self.exception == 'true')
- else:
+ else:
self.assertEqual(True, self.exception == 'false')
if __name__ == '__main__':
diff --git a/test/suite/test_txn05.py b/test/suite/test_txn05.py
index 8a2f36fc910..d427b893b17 100644
--- a/test/suite/test_txn05.py
+++ b/test/suite/test_txn05.py
@@ -181,7 +181,7 @@ class test_txn05(wttest.WiredTigerTestCase, suite_subprocess):
ok, txn = ot
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'stop':
c.set_key(k)
diff --git a/test/suite/test_txn07.py b/test/suite/test_txn07.py
index 8e7119186f5..fa522582a8e 100644
--- a/test/suite/test_txn07.py
+++ b/test/suite/test_txn07.py
@@ -171,7 +171,7 @@ class test_txn07(wttest.WiredTigerTestCase, suite_subprocess):
ok, txn = ot
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
op, k = ok
-
+
# print '%d: %s(%d)[%s]' % (i, ok[0], ok[1], txn)
if op == 'stop':
c.set_key(k)
diff --git a/test/suite/test_txn09.py b/test/suite/test_txn09.py
index 98229c52f2e..df085a75d67 100644
--- a/test/suite/test_txn09.py
+++ b/test/suite/test_txn09.py
@@ -139,7 +139,7 @@ class test_txn09(wttest.WiredTigerTestCase, suite_subprocess):
for i, ot in enumerate(zip(ops, txns)):
ok, txn = ot
op, k = ok
-
+
# Close and reopen the connection and cursor, toggling the log
self.log_enabled = not self.log_enabled
self.reopen_conn()
diff --git a/test/suite/test_txn10.py b/test/suite/test_txn10.py
index cee25562756..8810df46777 100644
--- a/test/suite/test_txn10.py
+++ b/test/suite/test_txn10.py
@@ -62,15 +62,15 @@ class test_txn10(wttest.WiredTigerTestCase, suite_subprocess):
self.close_conn()
self.conn = self.setUpConnectionOpen(newdir)
self.session = self.setUpSessionOpen(self.conn)
-
+
def test_recovery(self):
''' Check for bugs in file ID allocation. '''
# Here's the strategy:
- # - Create a table (t1).
- # - Do a clean restart.
- # - Create another table (t2).
- # - Insert data into t2.
+ # - Create a table (t1).
+ # - Do a clean restart.
+ # - Create another table (t2).
+ # - Insert data into t2.
# - Make recovery run.
#
# If we aren't tracking file IDs properly, it's possible that
diff --git a/test/suite/test_txn12.py b/test/suite/test_txn12.py
new file mode 100644
index 00000000000..0901811535e
--- /dev/null
+++ b/test/suite/test_txn12.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2015 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from suite_subprocess import suite_subprocess
+from wiredtiger import stat
+from wtscenario import multiply_scenarios, number_scenarios
+
+# test_txn12.py
+# Test of a commit following a failed operation in a read-only transaction.
+class test_txn12(wttest.WiredTigerTestCase, suite_subprocess):
+ name = 'test_txn12'
+ uri = 'table:' + name
+ create_params = 'key_format=i,value_format=i'
+
+ # Test that read-only transactions can commit following a failure.
+ def test_txn12(self):
+
+    # Set up the session and table.
+ session = self.conn.open_session(None)
+ session.create(self.uri, self.create_params)
+ session.begin_transaction("isolation=snapshot")
+
+    # Create a read-only transaction.
+ c = session.open_cursor(self.uri, None)
+ c.next()
+ msg = '/next_random.*boolean/'
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg)
+ # This commit should succeed as we have done no writes.
+ session.commit_transaction()
+
+ # Create a read/write transaction.
+ session.begin_transaction("isolation=snapshot")
+ c = session.open_cursor(self.uri, None)
+ c[123] = 123
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.open_cursor(self.uri, None, "next_random=bar"), msg)
+    # This commit should fail as we have written something.
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda:session.commit_transaction(), '/requires rollback/')
+
+if __name__ == '__main__':
+ wttest.run()
+
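
The new test pins down a subtle rule: a failed operation inside a transaction forces rollback only if the transaction has made updates; a read-only transaction can still commit. A minimal sketch of the same sequence outside the test harness, assuming the standalone wiredtiger Python API ('table:demo' is illustrative):

import os
import wiredtiger

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
session = conn.open_session()
session.create('table:demo', 'key_format=i,value_format=i')

# The invalid next_random value raises, but since the transaction made
# no updates, commit still succeeds.
session.begin_transaction('isolation=snapshot')
try:
    session.open_cursor('table:demo', None, 'next_random=bar')
except wiredtiger.WiredTigerError:
    pass
session.commit_transaction()
conn.close()
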
diff --git a/test/suite/test_util01.py b/test/suite/test_util01.py
index 0b1e2a35833..29033fb43ba 100644
--- a/test/suite/test_util01.py
+++ b/test/suite/test_util01.py
@@ -168,7 +168,7 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess):
dumpargs.append("-x")
dumpargs.append(self.tablename)
self.runWt(dumpargs, outfilename="dump.out")
-
+
self.assertTrue(self.compare_files("expect.out", "dump.out"))
def test_dump_process(self):
@@ -179,10 +179,10 @@ class test_util01(wttest.WiredTigerTestCase, suite_subprocess):
def test_dump_api(self):
self.dump(True, False)
-
+
def test_dump_api_hex(self):
self.dump(True, True)
-
+
if __name__ == '__main__':
wttest.run()
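
The dump test shells out to the wt utility, optionally passing -x for hexadecimal output. A hedged sketch of the equivalent standalone invocation ('table:main' and the paths are illustrative, and a wt binary is assumed to be on PATH):

import subprocess

# Hex-dump a table from the WT_HOME database directory into dump.out;
# drop '-x' for the printable form the test compares by default.
with open('dump.out', 'w') as f:
    subprocess.check_call(
        ['wt', '-h', 'WT_HOME', 'dump', '-x', 'table:main'], stdout=f)
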
diff --git a/test/suite/wtscenario.py b/test/suite/wtscenario.py
index 6e4b0d3464e..0f8e8c30c1f 100644
--- a/test/suite/wtscenario.py
+++ b/test/suite/wtscenario.py
@@ -61,7 +61,7 @@ def log2chr(val):
return chr(ord('0') + p)
else:
return chr(ord('a') + p - 10)
-
+
megabyte = 1024 * 1024
def check_scenarios(scenes):
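
log2chr encodes an exponent as a single scenario-label character: digits for values below 10, letters from 'a' onward above that. A hedged usage sketch (assuming the unseen head of the function computes p as the base-2 logarithm of val):

from wtscenario import log2chr, megabyte

print log2chr(megabyte)   # 2^20 -> p = 20 -> chr(ord('a') + 20 - 10) == 'k'
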
diff --git a/test/suite/wttest.py b/test/suite/wttest.py
index 9f833b0b6a4..443fabb00b2 100644
--- a/test/suite/wttest.py
+++ b/test/suite/wttest.py
@@ -169,14 +169,14 @@ class WiredTigerTestCase(unittest.TestCase):
self.captureerr = CapturedFd('stderr.txt', 'error output')
sys.stdout = self.captureout.capture()
sys.stderr = self.captureerr.capture()
-
+
def fdTearDown(self):
# restore stderr/stdout
self.captureout.release()
self.captureerr.release()
sys.stdout = WiredTigerTestCase._stdout
sys.stderr = WiredTigerTestCase._stderr
-
+
def __init__(self, *args, **kwargs):
if hasattr(self, 'scenarios'):
assert(len(self.scenarios) == len(dict(self.scenarios)))
@@ -204,11 +204,11 @@ class WiredTigerTestCase(unittest.TestCase):
'create,error_prefix="%s",%s' % (self.shortid(), self.conn_config))
self.pr(`conn`)
return conn
-
+
# Can be overridden
def setUpSessionOpen(self, conn):
return conn.open_session(None)
-
+
# Can be overridden
def close_conn(self):
"""
@@ -351,7 +351,7 @@ class WiredTigerTestCase(unittest.TestCase):
else:
with self.expectedStderr(message):
self.assertRaises(exceptionType, expr)
-
+
def exceptionToStderr(self, expr):
"""
Used by assertRaisesHavingMessage to convert an expression
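
CapturedFd packages a standard idiom: swap sys.stdout and sys.stderr for file-backed streams in setup, then restore the saved originals in tear-down. A condensed, hedged sketch of the bare pattern (file name illustrative):

import sys

saved_stdout = sys.stdout
sys.stdout = open('stdout.txt', 'w')
try:
    print 'this line lands in stdout.txt'
finally:
    # Always restore the real stream, even if the test body raises.
    sys.stdout.close()
    sys.stdout = saved_stdout
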
diff --git a/test/suite/wtthread.py b/test/suite/wtthread.py
index 1e2e4f56380..8959684d6d3 100644
--- a/test/suite/wtthread.py
+++ b/test/suite/wtthread.py
@@ -35,7 +35,7 @@ class checkpoint_thread(threading.Thread):
self.conn = conn
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
while not self.done.isSet():
@@ -50,7 +50,7 @@ class backup_thread(threading.Thread):
self.conn = conn
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
while not self.done.isSet():
@@ -111,7 +111,7 @@ class op_thread(threading.Thread):
self.queue = queue
self.done = done
threading.Thread.__init__(self)
-
+
def run(self):
sess = self.conn.open_session()
if (len(self.uris) == 1):
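
The three helper classes share one pattern: each thread opens its own WiredTiger session (a session must not be used by multiple threads concurrently) and loops until the shared threading.Event is set. A condensed, hedged sketch of the checkpoint variant, with connection setup added so it runs standalone:

import os
import threading
import wiredtiger

class checkpointer(threading.Thread):
    def __init__(self, conn, done):
        self.conn = conn
        self.done = done
        threading.Thread.__init__(self)

    def run(self):
        # Each thread needs its own session.
        sess = self.conn.open_session()
        while not self.done.isSet():
            sess.checkpoint()
        sess.close()

if not os.path.isdir('WT_HOME'):
    os.mkdir('WT_HOME')
conn = wiredtiger.wiredtiger_open('WT_HOME', 'create')
done = threading.Event()
t = checkpointer(conn, done)
t.start()
# ... run a workload on other sessions here ...
done.set()
t.join()
conn.close()
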
diff --git a/tools/wtstats/stat_data.py b/tools/wtstats/stat_data.py
index db5b14d6cd6..f2f193c0860 100644
--- a/tools/wtstats/stat_data.py
+++ b/tools/wtstats/stat_data.py
@@ -13,6 +13,7 @@ no_scale_per_second_list = [
'cache: tracked dirty bytes in the cache',
'cache: tracked dirty pages in the cache',
'connection: files currently open',
+ 'data-handle: connection data handles currently active',
'log: maximum log file size',
'log: number of pre-allocated log files to create',
'log: total log buffer size',
@@ -42,6 +43,7 @@ no_scale_per_second_list = [
'btree: column-store internal pages',
'btree: column-store variable-size deleted values',
'btree: column-store variable-size leaf pages',
+ 'btree: column-store variable-size RLE encoded values',
'btree: fixed-record size',
'btree: maximum internal page key size',
'btree: maximum internal page size',
@@ -74,6 +76,7 @@ no_clear_list = [
'cache: tracked dirty bytes in the cache',
'cache: tracked dirty pages in the cache',
'connection: files currently open',
+ 'data-handle: connection data handles currently active',
'log: maximum log file size',
'log: number of pre-allocated log files to create',
'log: total log buffer size',
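
These lists key off the statistics' full 'category: description' strings: entries in no_scale_per_second_list are point-in-time gauges (like the newly added data-handle count) that must not be converted to per-second rates, and no_clear_list entries must not be reset between samples. A hedged sketch of how a consumer might consult them (the helper is hypothetical, not part of wtstats):

def scale_per_second(stat_name):
    # Hypothetical helper: gauges such as
    # 'data-handle: connection data handles currently active' are
    # absolute values, so leave them unscaled.
    return stat_name not in no_scale_per_second_list
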