author     Michael Cahill <michael.cahill@wiredtiger.com>  2014-02-07 18:16:22 +1100
committer  Michael Cahill <michael.cahill@wiredtiger.com>  2014-02-07 18:16:22 +1100
commit     3bcd2a96e6546419a871dba4a35a2e2a3453adb9
tree       d93f25e4d576e47adbf78b352c910e7354d68639
parent     3b6d36874f716625c3f8c867f9185c829931472e
parent     0f319b1107960bdeb7d617d1797dd992029bb1df
Merge branch 'develop' into checkpoint-directio
-rw-r--r--  NEWS | 43
-rw-r--r--  README | 6
-rw-r--r--  RELEASE | 4
-rw-r--r--  bench/wtperf/config.c | 10
-rw-r--r--  bench/wtperf/misc.c | 4
-rw-r--r--  bench/wtperf/runners/fruit-lsm.wtperf | 19
-rw-r--r--  bench/wtperf/runners/fruit-short.wtperf | 19
-rw-r--r--  bench/wtperf/runners/medium-lsm-compact.wtperf | 2
-rw-r--r--  bench/wtperf/runners/test1-500m-lsm.wtperf | 16
-rw-r--r--  bench/wtperf/runners/test1-50m-lsm.wtperf | 16
-rw-r--r--  bench/wtperf/runners/test2-500m-lsm.wtperf | 16
-rw-r--r--  bench/wtperf/runners/test2-50m-lsm.wtperf | 16
-rw-r--r--  bench/wtperf/runners/test3-500m-lsm.wtperf | 18
-rw-r--r--  bench/wtperf/runners/test3-50m-lsm.wtperf | 18
-rw-r--r--  bench/wtperf/runners/test4-500m-lsm.wtperf | 16
-rw-r--r--  bench/wtperf/runners/test4-50m-lsm.wtperf | 16
-rwxr-xr-x  bench/wtperf/smoke.sh | 2
-rw-r--r--  bench/wtperf/track.c | 32
-rw-r--r--  bench/wtperf/wtperf.c | 375
-rw-r--r--  bench/wtperf/wtperf.h | 9
-rw-r--r--  bench/wtperf/wtperf_opt.i | 11
-rw-r--r--  build_posix/Make.subdirs | 3
-rw-r--r--  build_posix/aclocal/options.m4 | 17
-rw-r--r--  build_posix/aclocal/version-set.m4 | 8
-rw-r--r--  build_posix/aclocal/version.m4 | 2
-rw-r--r--  build_posix/configure.ac.in | 20
-rw-r--r--  dist/api_data.py | 13
-rw-r--r--  dist/s_string.ok | 15
-rw-r--r--  dist/stat_data.py | 6
-rw-r--r--  examples/c/ex_all.c | 19
-rw-r--r--  ext/compressors/bzip2/bzip2_compress.c | 49
-rw-r--r--  ext/compressors/zlib/Makefile.am | 6
-rw-r--r--  ext/compressors/zlib/zlib_compress.c | 367
-rw-r--r--  ext/datasources/helium/Makefile.am | 11
-rw-r--r--  ext/datasources/helium/README (renamed from ext/test/memrata/README) | 62
-rw-r--r--  ext/datasources/helium/helium.c (renamed from ext/test/memrata/memrata.c) | 1708
-rw-r--r--  ext/test/memrata/Makefile.am | 12
-rw-r--r--  lang/python/wiredtiger.i | 22
-rw-r--r--  src/btree/bt_evict.c | 149
-rw-r--r--  src/btree/bt_handle.c | 37
-rw-r--r--  src/btree/rec_evict.c | 30
-rw-r--r--  src/btree/rec_merge.c | 45
-rw-r--r--  src/btree/rec_track.c | 53
-rw-r--r--  src/btree/rec_write.c | 22
-rw-r--r--  src/docs/compression.dox | 83
-rw-r--r--  src/docs/helium.dox | 125
-rw-r--r--  src/docs/hot_backup.dox | 10
-rw-r--r--  src/docs/memrata.dox | 129
-rw-r--r--  src/docs/programming.dox | 2
-rw-r--r--  src/docs/spell.ok | 12
-rw-r--r--  src/docs/top/Doxyfile | 2
-rw-r--r--  src/docs/top/main.dox | 6
-rw-r--r--  src/docs/upgrading.dox | 20
-rw-r--r--  src/include/btmem.h | 5
-rw-r--r--  src/include/btree.i | 78
-rw-r--r--  src/include/stat.h | 4
-rw-r--r--  src/include/txn.i | 2
-rw-r--r--  src/include/wiredtiger.in | 108
-rw-r--r--  src/lsm/lsm_cursor.c | 11
-rw-r--r--  src/lsm/lsm_tree.c | 7
-rw-r--r--  src/support/stat.c | 12
-rw-r--r--  src/txn/txn_ckpt.c | 6
-rw-r--r--  test/format/Makefile.am | 19
-rw-r--r--  test/format/backup.c | 25
-rw-r--r--  test/format/bdb.c | 2
-rw-r--r--  test/format/bulk.c | 2
-rw-r--r--  test/format/compact.c | 2
-rw-r--r--  test/format/config.c | 56
-rw-r--r--  test/format/config.h | 4
-rw-r--r--  test/format/format.h | 14
-rw-r--r--  test/format/ops.c | 4
-rwxr-xr-x  test/format/s_dumpcmp.sh | 10
-rw-r--r--  test/format/t.c | 8
-rw-r--r--  test/format/wts.c | 88
-rw-r--r--  tools/wtperf_graph.py | 182
75 files changed, 2658 insertions(+), 1724 deletions(-)
diff --git a/NEWS b/NEWS
index 80521e53b3a..9f0682f0ecd 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,46 @@
+WiredTiger release 2.1.0, 2014-02-04
+------------------------------------
+
+The WiredTiger 2.1.0 release contains new features, performance enhancements
+and bug fixes. Significant changes include:
+
+The WT_ITEM structure was changed so that the size field is a size_t rather
+than a uint32_t. See upgrading documentation for details.
+
+The compress_raw interface was changed so that the call can be repeated with
+more records. See upgrading documentation for details.
+
+In LSM trees, the memory_page_max setting is ignored. The effective setting
+is double the chunk size. [#861][#859]
+
+Add support for zlib compression. [#855] [#865]
+
+Various enhancements to how WiredTiger generates tree structures in memory to
+help maintain consistent performance as table size grows. [#851]
+
+Add support for Levyx Inc. Helium as an external data source in WiredTiger.
+[#849][#850]
+
+Improve insert performance when a table contains many identical overflow
+items.
+
+Various performance enhancements to btree searches. [#838][#839][#840]
+
+Add support for newer versions of automake, up to 1.14. [#599][#841]
+
+Improve multi-threaded throughput of durable log writes, including changing
+the default wiredtiger_open transaction_sync configuration from dsync to
+fsync; see the upgrading documentation for further information. [#831][#832]
+
+In the Python and Java APIs, automatically close handles to prevent invalid
+accesses by applications. [#649][#800][#830]
+
+Various enhancements to the LSM merge algorithm, including improvements to how
+files are selected for merging, and throttling based on whether merges are
+keeping up (to limit write amplification). Made the minimum number of chunks
+chosen to merge configurable. [#817][#819][#822]
+
+
WiredTiger release 2.0.1, 2013-12-12
------------------------------------
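
The WT_ITEM note above means application code that kept item sizes in a
uint32_t should now use size_t. A minimal illustrative sketch (the helper name
and the raw-format cursor are hypothetical, not part of this change):

    #include <string.h>
    #include <wiredtiger.h>

    /* Insert a key/value pair on a table created with key_format=u,value_format=u. */
    static int
    put_raw_item(WT_CURSOR *cursor,
        const void *key, size_t key_len, const void *val, size_t val_len)
    {
        WT_ITEM k, v;

        memset(&k, 0, sizeof(k));
        memset(&v, 0, sizeof(v));
        k.data = key;
        k.size = key_len;               /* WT_ITEM.size is now a size_t */
        v.data = val;
        v.size = val_len;
        cursor->set_key(cursor, &k);
        cursor->set_value(cursor, &v);
        return (cursor->insert(cursor));
    }
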
diff --git a/README b/README
index 83a190b2f14..bffb154233c 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-WiredTiger 2.0.2: (December 12, 2013)
+WiredTiger 2.1.1: (February 4, 2014)
-This is version 2.0.2 of WiredTiger.
+This is version 2.1.1 of WiredTiger.
WiredTiger release packages and documentation can be found at:
@@ -9,7 +9,7 @@ WiredTiger release packages and documentation can be found at:
Information on configuring, building and installing WiredTiger can be
found at:
- http://source.wiredtiger.com/2.0.2/install.html
+ http://source.wiredtiger.com/2.1.1/install.html
WiredTiger licensing information can be found at:
diff --git a/RELEASE b/RELEASE
index dbe6867bff9..9254d748a4d 100644
--- a/RELEASE
+++ b/RELEASE
@@ -1,6 +1,6 @@
WIREDTIGER_VERSION_MAJOR=2
-WIREDTIGER_VERSION_MINOR=0
-WIREDTIGER_VERSION_PATCH=2
+WIREDTIGER_VERSION_MINOR=1
+WIREDTIGER_VERSION_PATCH=1
WIREDTIGER_VERSION="$WIREDTIGER_VERSION_MAJOR.$WIREDTIGER_VERSION_MINOR.$WIREDTIGER_VERSION_PATCH"
WIREDTIGER_RELEASE_DATE=`date "+%B %e, %Y"`
diff --git a/bench/wtperf/config.c b/bench/wtperf/config.c
index 04db60a5193..2bb0201d639 100644
--- a/bench/wtperf/config.c
+++ b/bench/wtperf/config.c
@@ -491,6 +491,8 @@ config_print(CONFIG *cfg)
printf("\tHome: %s\n", cfg->home);
printf("\tTable name: %s\n", cfg->table_name);
printf("\tConnection configuration: %s\n", cfg->conn_config);
+ if (cfg->sess_config != NULL)
+ printf("\tSession configuration: %s\n", cfg->sess_config);
printf("\t%s table: %s\n",
cfg->create ? "Creating new" : "Using existing",
@@ -598,13 +600,11 @@ config_opt_usage(void)
void
usage(void)
{
- printf("wtperf [-LMSv] [-C config] "
- "[-h home] [-O file] [-o option] [-T config]\n");
- printf("\t-L Use a large default configuration\n");
- printf("\t-M Use a medium default configuration\n");
- printf("\t-S Use a small default configuration\n");
+ printf("wtperf [-C config] "
+ "[-H mount] [-h home] [-O file] [-o option] [-T config]\n");
printf("\t-C <string> additional connection configuration\n");
printf("\t (added to option conn_config)\n");
+ printf("\t-H <mount> configure Helium volume mount point\n");
printf("\t-h <string> Wired Tiger home must exist, default WT_TEST\n");
printf("\t-O <file> file contains options as listed below\n");
printf("\t-o option=val[,option=val,...] set options listed below\n");
diff --git a/bench/wtperf/misc.c b/bench/wtperf/misc.c
index f9a36921164..34efa728185 100644
--- a/bench/wtperf/misc.c
+++ b/bench/wtperf/misc.c
@@ -49,11 +49,11 @@ setup_log_file(CONFIG *cfg)
if (cfg->verbose < 1)
return (0);
- if ((fname = calloc(strlen(cfg->home) +
+ if ((fname = calloc(strlen(cfg->monitor_dir) +
strlen(cfg->table_name) + strlen(".stat") + 2, 1)) == NULL)
return (enomem(cfg));
- sprintf(fname, "%s/%s.stat", cfg->home, cfg->table_name);
+ sprintf(fname, "%s/%s.stat", cfg->monitor_dir, cfg->table_name);
cfg->logf = fopen(fname, "w");
free(fname);
diff --git a/bench/wtperf/runners/fruit-lsm.wtperf b/bench/wtperf/runners/fruit-lsm.wtperf
new file mode 100644
index 00000000000..193b29d38bf
--- /dev/null
+++ b/bench/wtperf/runners/fruit-lsm.wtperf
@@ -0,0 +1,19 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+icount=25000000
+key_sz=40
+value_sz=800
+pareto=true
+populate_threads=20
+report_interval=10
+random_value=true
+run_time=18000
+sample_interval=10
+threads=((count=20,read=6,update=1))
diff --git a/bench/wtperf/runners/fruit-short.wtperf b/bench/wtperf/runners/fruit-short.wtperf
new file mode 100644
index 00000000000..9061e231bbe
--- /dev/null
+++ b/bench/wtperf/runners/fruit-short.wtperf
@@ -0,0 +1,19 @@
+# wtperf options file: simulate riak and its test1 and test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=600)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+icount=25000000
+key_sz=40
+value_sz=800
+pareto=true
+populate_threads=20
+report_interval=10
+random_value=true
+run_time=1800
+sample_interval=10
+threads=((count=20,read=6,update=1))
diff --git a/bench/wtperf/runners/medium-lsm-compact.wtperf b/bench/wtperf/runners/medium-lsm-compact.wtperf
index 5393cdbfeba..62ae8cf86ca 100644
--- a/bench/wtperf/runners/medium-lsm-compact.wtperf
+++ b/bench/wtperf/runners/medium-lsm-compact.wtperf
@@ -1,6 +1,6 @@
# wtperf options file: medium lsm configuration
conn_config="cache_size=1G"
-table_config="lsm=(chunk_size=100MB,merge_threads=2),type=lsm"
+table_config="lsm=(chunk_size=100MB,merge_threads=2,chunk_max=1TB),type=lsm"
icount=50000000
populate_threads=1
compact=true
diff --git a/bench/wtperf/runners/test1-500m-lsm.wtperf b/bench/wtperf/runners/test1-500m-lsm.wtperf
new file mode 100644
index 00000000000..bdb4b7bd066
--- /dev/null
+++ b/bench/wtperf/runners/test1-500m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+icount=500000000
+key_sz=40
+value_sz=1000
+populate_threads=20
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/bench/wtperf/runners/test1-50m-lsm.wtperf b/bench/wtperf/runners/test1-50m-lsm.wtperf
new file mode 100644
index 00000000000..4b99b27e625
--- /dev/null
+++ b/bench/wtperf/runners/test1-50m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test1 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024"
+compact=true
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+icount=50000000
+key_sz=40
+value_sz=1000
+populate_threads=10
+report_interval=10
+random_value=true
+sample_interval=10
diff --git a/bench/wtperf/runners/test2-500m-lsm.wtperf b/bench/wtperf/runners/test2-500m-lsm.wtperf
new file mode 100644
index 00000000000..49eb2356d6d
--- /dev/null
+++ b/bench/wtperf/runners/test2-500m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=4,updates=1))
diff --git a/bench/wtperf/runners/test2-50m-lsm.wtperf b/bench/wtperf/runners/test2-50m-lsm.wtperf
new file mode 100644
index 00000000000..5ef18057961
--- /dev/null
+++ b/bench/wtperf/runners/test2-50m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test2 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+report_interval=10
+run_time=1440
+sample_interval=10
+threads=((count=10,reads=4,updates=1))
diff --git a/bench/wtperf/runners/test3-500m-lsm.wtperf b/bench/wtperf/runners/test3-500m-lsm.wtperf
new file mode 100644
index 00000000000..73bc121c39e
--- /dev/null
+++ b/bench/wtperf/runners/test3-500m-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+pareto=true
+report_interval=10
+run_time=14400
+sample_interval=10
+#threads=((count=20,reads=1,updates=1))
+threads=((count=10,reads=1),(count=10,updates=1))
diff --git a/bench/wtperf/runners/test3-50m-lsm.wtperf b/bench/wtperf/runners/test3-50m-lsm.wtperf
new file mode 100644
index 00000000000..e1efe394f47
--- /dev/null
+++ b/bench/wtperf/runners/test3-50m-lsm.wtperf
@@ -0,0 +1,18 @@
+# wtperf options file: simulate riak and its test3 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+pareto=true
+report_interval=10
+run_time=1440
+sample_interval=10
+#threads=((count=10,reads=1,updates=1))
+threads=((count=5,reads=1),(count=5,updates=1))
diff --git a/bench/wtperf/runners/test4-500m-lsm.wtperf b/bench/wtperf/runners/test4-500m-lsm.wtperf
new file mode 100644
index 00000000000..058790ce2a0
--- /dev/null
+++ b/bench/wtperf/runners/test4-500m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=60)"
+conn_config="cache_size=21G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+report_interval=10
+run_time=14400
+sample_interval=10
+threads=((count=20,reads=1))
diff --git a/bench/wtperf/runners/test4-50m-lsm.wtperf b/bench/wtperf/runners/test4-50m-lsm.wtperf
new file mode 100644
index 00000000000..a081dabce9e
--- /dev/null
+++ b/bench/wtperf/runners/test4-50m-lsm.wtperf
@@ -0,0 +1,16 @@
+# wtperf options file: simulate riak and its test4 configuration
+# The configuration for the connection and table are from riak and the
+# specification of the data (count, size, threads) is from basho_bench.
+# This test assumes that a test1 populate already completed and exists.
+#
+#conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024,statistics=(fast,clear),statistics_log=(wait=30)"
+conn_config="cache_size=10G,checkpoint_sync=false,mmap=false,session_max=1024"
+create=false
+sess_config="isolation=snapshot"
+table_config="internal_page_max=128K,lsm=(bloom_config=(leaf_page_max=8MB),bloom_bit_count=28,bloom_hash_count=19,bloom_oldest=true,chunk_size=100MB,merge_threads=2),type=lsm"
+key_sz=40
+value_sz=1000
+report_interval=10
+run_time=1440
+sample_interval=10
+threads=((count=10,reads=1))
diff --git a/bench/wtperf/smoke.sh b/bench/wtperf/smoke.sh
index 192d3cd208b..d6eaed1fc50 100755
--- a/bench/wtperf/smoke.sh
+++ b/bench/wtperf/smoke.sh
@@ -1,4 +1,4 @@
#! /bin/sh
# Smoke-test wtperf as part of running "make check".
-./wtperf -S
+./wtperf -O ../../../bench/wtperf/runners/small-lsm.wtperf -o "run_time=20"
diff --git a/bench/wtperf/track.c b/bench/wtperf/track.c
index 9d13fce5602..12f50dd5411 100644
--- a/bench/wtperf/track.c
+++ b/bench/wtperf/track.c
@@ -71,11 +71,17 @@ sum_ops(CONFIG *cfg, size_t field_offset)
{
CONFIG_THREAD *thread;
uint64_t total;
- int64_t i;
+ int64_t i, th_cnt;
total = 0;
- for (i = 0, thread = cfg->workers;
- thread != NULL && i < cfg->workers_cnt; ++i, ++thread)
+ if (cfg->popthreads == NULL) {
+ thread = cfg->workers;
+ th_cnt = cfg->workers_cnt;
+ } else {
+ thread = cfg->popthreads;
+ th_cnt = cfg->populate_threads;
+ }
+ for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread)
total += ((TRACK *)((uint8_t *)thread + field_offset))->ops;
return (total);
@@ -108,19 +114,25 @@ latency_op(CONFIG *cfg,
CONFIG_THREAD *thread;
TRACK *track;
uint64_t ops, latency, tmp;
- int64_t i;
+ int64_t i, th_cnt;
uint32_t max, min;
ops = latency = 0;
max = 0;
min = UINT32_MAX;
- for (i = 0, thread = cfg->workers;
- thread != NULL && i < cfg->workers_cnt; ++i, ++thread) {
+ if (cfg->popthreads == NULL) {
+ thread = cfg->workers;
+ th_cnt = cfg->workers_cnt;
+ } else {
+ thread = cfg->popthreads;
+ th_cnt = cfg->populate_threads;
+ }
+ for (i = 0; thread != NULL && i < th_cnt; ++i, ++thread) {
track = (TRACK *)((uint8_t *)thread + field_offset);
- tmp = track->ops;
- ops += tmp - track->last_ops;
- track->last_ops = tmp;
+ tmp = track->latency_ops;
+ ops += tmp - track->last_latency_ops;
+ track->last_latency_ops = tmp;
tmp = track->latency;
latency += tmp - track->last_latency;
track->last_latency = tmp;
@@ -261,7 +273,7 @@ latency_print_single(CONFIG *cfg, TRACK *total, const char *name)
uint64_t cumops;
char path[1024];
- snprintf(path, sizeof(path), "%s/latency.%s", cfg->home, name);
+ snprintf(path, sizeof(path), "%s/latency.%s", cfg->monitor_dir, name);
if ((fp = fopen(path, "w")) == NULL) {
lprintf(cfg, errno, 0, "%s", path);
return;
diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c
index 6e6625c40f0..7543f79747e 100644
--- a/bench/wtperf/wtperf.c
+++ b/bench/wtperf/wtperf.c
@@ -30,6 +30,7 @@
/* Default values. */
static const CONFIG default_cfg = {
"WT_TEST", /* home */
+ "WT_TEST", /* monitor dir */
NULL, /* uri */
NULL, /* conn */
NULL, /* logf */
@@ -45,39 +46,6 @@ static const CONFIG default_cfg = {
#undef OPT_DEFINE_DEFAULT
};
-static const char * const small_config_str =
- "conn_config=\"cache_size=500MB\","
- "table_config=\"lsm=(chunk_size=5MB)\","
- "icount=500000,"
- "value_sz=100,"
- "key_sz=20,"
- "report_interval=5,"
- "run_time=20,"
- "populate_threads=1,"
- "threads=((count=8,read=1)),";
-
-static const char * const med_config_str =
- "conn_config=\"cache_size=1GB\","
- "table_config=\"lsm=(chunk_size=20MB)\","
- "icount=50000000,"
- "value_sz=100,"
- "key_sz=20,"
- "report_interval=5,"
- "run_time=100,"
- "populate_threads=1,"
- "threads=((count=16,read=1)),";
-
-static const char * const large_config_str =
- "conn_config=\"cache_size=2GB\","
- "table_config=\"lsm=(chunk_size=50MB)\","
- "icount=500000000,"
- "value_sz=100,"
- "key_sz=20,"
- "report_interval=5,"
- "run_time=600,"
- "populate_threads=1,"
- "threads=((count=16,read=1)),";
-
static const char * const debug_cconfig = "verbose=[lsm]";
static const char * const debug_tconfig = "";
@@ -92,6 +60,8 @@ static volatile int g_ckpt; /* checkpoint in progress */
static volatile int g_error; /* thread error */
static volatile int g_stop; /* notify threads to stop */
+static volatile uint32_t g_totalsec; /* total seconds running */
+
/*
* Atomic update where needed.
*/
@@ -107,6 +77,7 @@ static int execute_workload(CONFIG *);
static int find_table_count(CONFIG *);
static void *monitor(void *);
static void *populate_thread(void *);
+static void randomize_value(CONFIG *, char *);
static int start_threads(CONFIG *,
WORKLOAD *, CONFIG_THREAD *, u_int, void *(*)(void *));
static int stop_threads(CONFIG *, u_int, CONFIG_THREAD *);
@@ -114,7 +85,15 @@ static void *worker(void *);
static uint64_t wtperf_rand(CONFIG *);
static uint64_t wtperf_value_range(CONFIG *);
-/* We use a couple of WiredTiger library routines to simplify portability. */
+#define HELIUM_NAME "dev1"
+#define HELIUM_PATH \
+ "../../ext/test/helium/.libs/libwiredtiger_helium.so"
+#define HELIUM_CONFIG ",type=helium"
+
+/*
+ * wtperf uses a couple of internal WiredTiger library routines for timing
+ * and generating random numbers.
+ */
extern int __wt_epoch(void *, struct timespec *);
extern uint32_t __wt_random(void);
@@ -125,19 +104,34 @@ get_next_incr(void)
return (ATOMIC_ADD(g_insert_key, 1));
}
+static void
+randomize_value(CONFIG *cfg, char *value_buf)
+{
+ uint32_t i;
+
+ /*
+ * Each time we're called overwrite value_buf[0] and one
+ * other randomly chosen uint32_t.
+ */
+ i = __wt_random() % (cfg->value_sz / sizeof(uint32_t));
+ value_buf[0] = __wt_random();
+ value_buf[i] = __wt_random();
+ return;
+}
+
/*
- * track_aggregated_update --
+ * track_operation --
* Update an operation's tracking structure with new latency information.
*/
static inline void
-track_operation(TRACK *trk, uint64_t nsecs)
+track_operation(TRACK *trk, uint64_t usecs)
{
uint64_t v;
- /* average nanoseconds per call */
- v = (uint64_t)nsecs;
+ /* average microseconds per call */
+ v = (uint64_t)usecs;
- trk->latency += nsecs; /* track total latency */
+ trk->latency += usecs; /* track total latency */
if (v > trk->max_latency) /* track max/min latency */
trk->max_latency = (uint32_t)v;
@@ -148,20 +142,20 @@ track_operation(TRACK *trk, uint64_t nsecs)
* Update a latency bucket.
* First buckets: usecs from 100us to 1000us at 100us each.
*/
- if (v < us_to_ns(1000))
- ++trk->us[ns_to_us(v)];
+ if (v < 1000)
+ ++trk->us[v];
/*
* Second buckets: milliseconds from 1ms to 1000ms, at 1ms each.
*/
- else if (v < ms_to_ns(1000))
- ++trk->ms[ns_to_ms(v)];
+ else if (v < ms_to_us(1000))
+ ++trk->ms[us_to_ms(v)];
/*
* Third buckets are seconds from 1s to 100s, at 1s each.
*/
- else if (v < sec_to_ns(100))
- ++trk->sec[ns_to_sec(v)];
+ else if (v < sec_to_us(100))
+ ++trk->sec[us_to_sec(v)];
/* >100 seconds, accumulate in the biggest bucket. */
else
@@ -196,7 +190,7 @@ worker(void *arg)
WT_CONNECTION *conn;
WT_CURSOR *cursor;
WT_SESSION *session;
- uint64_t next_val, nsecs;
+ uint64_t next_val, usecs;
int measure_latency, ret;
uint8_t *op, *op_end;
char *value_buf, *key_buf, *value;
@@ -205,8 +199,10 @@ worker(void *arg)
cfg = thread->cfg;
conn = cfg->conn;
session = NULL;
+ trk = NULL;
- if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ if ((ret = conn->open_session(
+ conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0, "worker: WT_CONNECTION.open_session");
goto err;
}
@@ -290,6 +286,8 @@ worker(void *arg)
/* FALLTHROUGH */
case WORKER_INSERT:
+ if (cfg->random_value)
+ randomize_value(cfg, value_buf);
cursor->set_value(cursor, value_buf);
if ((ret = cursor->insert(cursor)) == 0)
break;
@@ -302,11 +300,13 @@ worker(void *arg)
"get_value in update.");
goto err;
}
- memcpy(value_buf, value, cfg->value_sz);
+ memcpy(value_buf, value, strlen(value));
if (value_buf[0] == 'a')
value_buf[0] = 'b';
else
value_buf[0] = 'a';
+ if (cfg->random_value)
+ randomize_value(cfg, value_buf);
cursor->set_value(cursor, value_buf);
if ((ret = cursor->update(cursor)) == 0)
break;
@@ -337,10 +337,9 @@ op_err: lprintf(cfg, ret, 0,
lprintf(cfg, ret, 0, "Get time call failed");
goto err;
}
- nsecs = (uint64_t)(stop.tv_nsec - start.tv_nsec);
- nsecs += sec_to_ns(
- (uint64_t)(stop.tv_sec - start.tv_sec));
- track_operation(trk, nsecs);
+ ++trk->latency_ops;
+ usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ track_operation(trk, usecs);
}
++trk->ops; /* increment operation counts */
@@ -467,14 +466,16 @@ run_mix_schedule(CONFIG *cfg, WORKLOAD *workp)
static void *
populate_thread(void *arg)
{
+ struct timespec start, stop;
CONFIG *cfg;
CONFIG_THREAD *thread;
+ TRACK *trk;
WT_CONNECTION *conn;
WT_CURSOR *cursor;
WT_SESSION *session;
uint32_t opcount;
- uint64_t op;
- int intxn, ret;
+ uint64_t op, usecs;
+ int intxn, measure_latency, ret;
char *value_buf, *key_buf;
thread = (CONFIG_THREAD *)arg;
@@ -482,11 +483,13 @@ populate_thread(void *arg)
conn = cfg->conn;
session = NULL;
ret = 0;
+ trk = &thread->insert;
key_buf = thread->key_buf;
value_buf = thread->value_buf;
- if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ if ((ret = conn->open_session(
+ conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0, "populate: WT_CONNECTION.open_session");
goto err;
}
@@ -500,60 +503,65 @@ populate_thread(void *arg)
}
/* Populate the database. */
- if (cfg->populate_ops_per_txn == 0)
- for (;;) {
- op = get_next_incr();
- if (op > cfg->icount)
- break;
+ for (intxn = 0, opcount = 0;;) {
+ op = get_next_incr();
+ if (op > cfg->icount)
+ break;
- sprintf(key_buf, "%0*" PRIu64, cfg->key_sz, op);
- cursor->set_key(cursor, key_buf);
- cursor->set_value(cursor, value_buf);
- if ((ret = cursor->insert(cursor)) != 0) {
- lprintf(cfg, ret, 0, "Failed inserting");
+ if (cfg->populate_ops_per_txn != 0 && !intxn) {
+ if ((ret = session->begin_transaction(
+ session, cfg->transaction_config)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Failed starting transaction.");
goto err;
}
- ++thread->insert.ops;
+ intxn = 1;
}
- else {
- for (intxn = 0, opcount = 0;;) {
- op = get_next_incr();
- if (op > cfg->icount)
- break;
-
- if (!intxn) {
- if ((ret = session->begin_transaction(
- session, cfg->transaction_config)) != 0) {
- lprintf(cfg, ret, 0,
- "Failed starting transaction.");
- goto err;
- }
- intxn = 1;
- }
- sprintf(key_buf, "%0*" PRIu64, cfg->key_sz, op);
- cursor->set_key(cursor, key_buf);
- cursor->set_value(cursor, value_buf);
- if ((ret = cursor->insert(cursor)) != 0) {
- lprintf(cfg, ret, 0, "Failed inserting");
+ sprintf(key_buf, "%0*" PRIu64, cfg->key_sz, op);
+ measure_latency = cfg->sample_interval != 0 && (
+ trk->ops % cfg->sample_rate == 0);
+ if (measure_latency &&
+ (ret = __wt_epoch(NULL, &start)) != 0) {
+ lprintf(cfg, ret, 0, "Get time call failed");
+ goto err;
+ }
+ cursor->set_key(cursor, key_buf);
+ if (cfg->random_value)
+ randomize_value(cfg, value_buf);
+ cursor->set_value(cursor, value_buf);
+ if ((ret = cursor->insert(cursor)) != 0) {
+ lprintf(cfg, ret, 0, "Failed inserting");
+ goto err;
+ }
+ /* Gather statistics */
+ if (measure_latency) {
+ if ((ret = __wt_epoch(NULL, &stop)) != 0) {
+ lprintf(cfg, ret, 0,
+ "Get time call failed");
goto err;
}
- ++thread->insert.ops;
+ ++trk->latency_ops;
+ usecs = ns_to_us(WT_TIMEDIFF(stop, start));
+ track_operation(trk, usecs);
+ }
+ ++thread->insert.ops; /* Same as trk->ops */
+ if (cfg->populate_ops_per_txn != 0) {
if (++opcount < cfg->populate_ops_per_txn)
continue;
opcount = 0;
- if ((ret =
- session->commit_transaction(session, NULL)) != 0)
+ if ((ret = session->commit_transaction(
+ session, NULL)) != 0)
lprintf(cfg, ret, 0,
"Fail committing, transaction was aborted");
intxn = 0;
}
- if (intxn &&
- (ret = session->commit_transaction(session, NULL)) != 0)
- lprintf(cfg, ret, 0,
- "Fail committing, transaction was aborted");
}
+ if (intxn &&
+ (ret = session->commit_transaction(session, NULL)) != 0)
+ lprintf(cfg, ret, 0,
+ "Fail committing, transaction was aborted");
if ((ret = session->close(session, NULL)) != 0) {
lprintf(cfg, ret, 0, "Error closing session in populate");
@@ -578,6 +586,7 @@ monitor(void *arg)
char buf[64], *path;
int ret;
uint64_t reads, inserts, updates;
+ uint64_t cur_reads, cur_inserts, cur_updates;
uint64_t last_reads, last_inserts, last_updates;
uint32_t read_avg, read_min, read_max;
uint32_t insert_avg, insert_min, insert_max;
@@ -591,19 +600,21 @@ monitor(void *arg)
path = NULL;
/* Open the logging file. */
- len = strlen(cfg->home) + 100;
+ len = strlen(cfg->monitor_dir) + 100;
if ((path = malloc(len)) == NULL) {
(void)enomem(cfg);
goto err;
}
- snprintf(path, len, "%s/monitor", cfg->home);
+ snprintf(path, len, "%s/monitor", cfg->monitor_dir);
if ((fp = fopen(path, "w")) == NULL) {
lprintf(cfg, errno, 0, "%s", path);
goto err;
}
+ /* Set line buffering for monitor file. */
+ (void)setvbuf(fp, NULL, _IOLBF, 0);
#ifdef __WRITE_A_HEADER
fprintf(fp,
- "#time,"
+ "#time,totalsec,"
"read operations,insert operations,update operations,"
"checkpoints,"
"read average latency(NS),read minimum latency(NS),"
@@ -639,18 +650,30 @@ monitor(void *arg)
latency_insert(cfg, &insert_avg, &insert_min, &insert_max);
latency_update(cfg, &update_avg, &update_min, &update_max);
+ cur_reads = reads - last_reads;
+ cur_updates = updates - last_updates;
+ /*
+ * For now the only item we need to worry about changing is
+ * inserts when we transition from the populate phase to
+ * workload phase.
+ */
+ if (inserts < last_inserts)
+ cur_inserts = 0;
+ else
+ cur_inserts = inserts - last_inserts;
+
(void)fprintf(fp,
- "%s"
+ "%s,%" PRIu32
",%" PRIu64 ",%" PRIu64 ",%" PRIu64
",%c"
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
",%" PRIu32 ",%" PRIu32 ",%" PRIu32
"\n",
- buf,
- (reads - last_reads) / cfg->sample_interval,
- (inserts - last_inserts) / cfg->sample_interval,
- (updates - last_updates) / cfg->sample_interval,
+ buf, g_totalsec,
+ cur_reads / cfg->sample_interval,
+ cur_inserts / cfg->sample_interval,
+ cur_updates / cfg->sample_interval,
g_ckpt ? 'Y' : 'N',
read_avg, read_min, read_max,
insert_avg, insert_min, insert_max,
@@ -689,7 +712,8 @@ checkpoint_worker(void *arg)
conn = cfg->conn;
session = NULL;
- if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ if ((ret = conn->open_session(
+ conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0,
"open_session failed in checkpoint thread.");
goto err;
@@ -742,6 +766,7 @@ err: g_error = g_stop = 1;
static int
execute_populate(CONFIG *cfg)
{
+ CONFIG_THREAD *popth;
WT_SESSION *session;
struct timespec start, stop;
double secs;
@@ -780,12 +805,13 @@ execute_populate(CONFIG *cfg)
if (++interval < cfg->report_interval)
continue;
interval = 0;
+ g_totalsec += cfg->report_interval;
g_insert_ops = sum_pop_ops(cfg);
lprintf(cfg, 0, 1,
"%" PRIu64 " populate inserts (%" PRIu64 " of %"
- PRIu32 ") in %" PRIu32 " secs",
+ PRIu32 ") in %" PRIu32 " secs (%" PRIu32 " total secs)",
g_insert_ops - last_ops, g_insert_ops,
- cfg->icount, cfg->report_interval);
+ cfg->icount, cfg->report_interval, g_totalsec);
last_ops = g_insert_ops;
}
if ((ret = __wt_epoch(NULL, &stop)) != 0) {
@@ -793,8 +819,17 @@ execute_populate(CONFIG *cfg)
return (ret);
}
- if ((ret =
- stop_threads(cfg, cfg->populate_threads, cfg->popthreads)) != 0)
+ /*
+ * Move popthreads aside to narrow a possible race with the monitor
+ * thread. The latency tracking code also requires that popthreads be
+ * NULL when the populate phase is finished, to know that the workload
+ * phase has started.
+ */
+ popth = cfg->popthreads;
+ cfg->popthreads = NULL;
+ ret = stop_threads(cfg, cfg->populate_threads, popth);
+ free(popth);
+ if (ret != 0)
return (ret);
/* Report if any worker threads didn't finish. */
@@ -817,7 +852,7 @@ execute_populate(CONFIG *cfg)
*/
if (cfg->compact) {
if ((ret = cfg->conn->open_session(
- cfg->conn, NULL, NULL, &session)) != 0) {
+ cfg->conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0,
"execute_populate: WT_CONNECTION.open_session");
return (ret);
@@ -897,7 +932,7 @@ execute_workload(CONFIG *cfg)
threads += workp->threads;
}
- for (interval = cfg->report_interval,
+ for (interval = cfg->report_interval,
run_time = cfg->run_time, run_ops = cfg->run_ops; g_error == 0;) {
/*
* Sleep for one second at a time.
@@ -927,15 +962,17 @@ execute_workload(CONFIG *cfg)
if (interval == 0 || --interval > 0)
continue;
interval = cfg->report_interval;
+ g_totalsec += cfg->report_interval;
lprintf(cfg, 0, 1,
"%" PRIu64 " reads, %" PRIu64 " inserts, %" PRIu64
- " updates, %" PRIu64 " checkpoints in %" PRIu32 " secs",
+ " updates, %" PRIu64 " checkpoints in %" PRIu32
+ " secs (%" PRIu32 " total secs)",
g_read_ops - last_reads,
g_insert_ops - last_inserts,
g_update_ops - last_updates,
g_ckpt_ops - last_ckpts,
- cfg->report_interval);
+ cfg->report_interval, g_totalsec);
last_reads = g_read_ops;
last_inserts = g_insert_ops;
last_updates = g_update_ops;
@@ -974,7 +1011,8 @@ find_table_count(CONFIG *cfg)
conn = cfg->conn;
- if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) {
+ if ((ret = conn->open_session(
+ conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0,
"open_session failed finding existing table count");
goto err;
@@ -1014,15 +1052,16 @@ main(int argc, char *argv[])
pthread_t monitor_thread;
size_t len;
uint64_t req_len, total_ops;
- int ch, monitor_created, ret, t_ret;
- const char *opts = "C:O:T:h:o:SML";
+ int ch, monitor_created, monitor_set, ret, t_ret;
+ const char *helium_mount;
+ const char *opts = "C:H:h:m:O:o:T:";
const char *wtperftmp_subdir = "wtperftmp";
const char *user_cconfig, *user_tconfig;
char *cmd, *cc_buf, *tc_buf, *tmphome;
session = NULL;
- monitor_created = ret = 0;
- user_cconfig = user_tconfig = NULL;
+ monitor_created = monitor_set = ret = 0;
+ helium_mount = user_cconfig = user_tconfig = NULL;
cmd = cc_buf = tc_buf = tmphome = NULL;
/* Setup the default configuration values. */
@@ -1037,6 +1076,10 @@ main(int argc, char *argv[])
case 'h':
cfg->home = optarg;
break;
+ case 'm':
+ cfg->monitor_dir = optarg;
+ monitor_set = 1;
+ break;
case '?':
fprintf(stderr, "Invalid option\n");
usage();
@@ -1044,10 +1087,17 @@ main(int argc, char *argv[])
}
/*
+ * If the user did not specify a monitor directory
+ * then set the monitor directory to the home dir.
+ */
+ if (!monitor_set)
+ cfg->monitor_dir = cfg->home;
+
+ /*
* Create a temporary directory underneath the test directory in which
- * we do an initial WiredTiger open, because we need a connection and
- * session in order to use the extension configuration parser. We will
- * open the real WiredTiger database after parsing the options.
+ * we do an initial WiredTiger open, because we need a connection in
+ * order to use the extension configuration parser. We will open the
+ * real WiredTiger database after parsing the options.
*/
len = strlen(cfg->home) + strlen(wtperftmp_subdir) + 2;
if ((tmphome = malloc(len)) == NULL) {
@@ -1070,11 +1120,6 @@ main(int argc, char *argv[])
lprintf(cfg, ret, 0, "wiredtiger_open: %s", tmphome);
goto err;
}
- if ((ret = cfg->conn->open_session(
- cfg->conn, NULL, NULL, &session)) != 0) {
- lprintf(cfg, ret, 0, "Error creating session");
- goto err;
- }
/*
* Then parse different config structures - other options override
@@ -1083,18 +1128,6 @@ main(int argc, char *argv[])
optind = 1;
while ((ch = getopt(argc, argv, opts)) != EOF)
switch (ch) {
- case 'S':
- if (config_opt_line(cfg, small_config_str) != 0)
- goto einval;
- break;
- case 'M':
- if (config_opt_line(cfg, med_config_str) != 0)
- goto einval;
- break;
- case 'L':
- if (config_opt_line(cfg, large_config_str) != 0)
- goto einval;
- break;
case 'O':
if (config_opt_file(cfg, optarg) != 0)
goto einval;
@@ -1108,26 +1141,33 @@ main(int argc, char *argv[])
optind = 1;
while ((ch = getopt(argc, argv, opts)) != EOF)
switch (ch) {
+ case 'C':
+ user_cconfig = optarg;
+ break;
+ case 'H':
+ helium_mount = optarg;
+ break;
case 'o':
/* Allow -o key=value */
if (config_opt_line(cfg, optarg) != 0)
goto einval;
break;
- case 'C':
- user_cconfig = optarg;
- break;
case 'T':
user_tconfig = optarg;
break;
}
/* Build the URI from the table name. */
- req_len = strlen("table:") + strlen(cfg->table_name) + 1;
+ req_len = strlen("table:") +
+ strlen(HELIUM_NAME) + strlen(cfg->table_name) + 2;
if ((cfg->uri = calloc(req_len, 1)) == NULL) {
ret = enomem(cfg);
goto err;
}
- snprintf(cfg->uri, req_len, "table:%s", cfg->table_name);
+ snprintf(cfg->uri, req_len, "table:%s%s%s",
+ helium_mount == NULL ? "" : HELIUM_NAME,
+ helium_mount == NULL ? "" : "/",
+ cfg->table_name);
if ((ret = setup_log_file(cfg)) != 0)
goto err;
@@ -1152,36 +1192,31 @@ main(int argc, char *argv[])
if ((ret = config_opt_str(cfg, "conn_config", cc_buf)) != 0)
goto err;
}
- if (cfg->verbose > 1 || user_tconfig != NULL) {
- req_len = strlen(cfg->table_config) + strlen(debug_tconfig) + 3;
+ if (cfg->verbose > 1 || helium_mount != NULL || user_tconfig != NULL) {
+ req_len = strlen(cfg->table_config) +
+ strlen(HELIUM_CONFIG) + strlen(debug_tconfig) + 3;
if (user_tconfig != NULL)
req_len += strlen(user_tconfig);
if ((tc_buf = calloc(req_len, 1)) == NULL) {
ret = enomem(cfg);
goto err;
}
- snprintf(tc_buf, req_len, "%s%s%s%s%s",
+ snprintf(tc_buf, req_len, "%s%s%s%s%s%s",
cfg->table_config,
cfg->verbose > 1 ? "," : "",
cfg->verbose > 1 ? debug_tconfig : "",
- user_tconfig ? "," : "", user_tconfig ? user_tconfig : "");
+ user_tconfig ? "," : "", user_tconfig ? user_tconfig : "",
+ helium_mount == NULL ? "" : HELIUM_CONFIG);
if ((ret = config_opt_str(cfg, "table_config", tc_buf)) != 0)
goto err;
}
- ret = session->close(session, NULL);
- session = NULL;
- if (ret != 0) {
- lprintf(cfg, ret, 0, "WT_SESSION.close");
- goto err;
- }
ret = cfg->conn->close(cfg->conn, NULL);
cfg->conn = NULL;
if (ret != 0) {
lprintf(cfg, ret, 0, "WT_CONNECTION.close: %s", tmphome);
goto err;
}
-
/* Sanity-check the configuration */
if (config_sanity(cfg) != 0)
goto err;
@@ -1195,9 +1230,22 @@ main(int argc, char *argv[])
goto err;
}
+ if (helium_mount != NULL) { /* Configure optional Helium volume. */
+ char helium_buf[256];
+ snprintf(helium_buf, sizeof(helium_buf),
+ "entry=wiredtiger_extension_init,config=["
+ "%s=[helium_devices=\"he://./%s\","
+ "helium_o_volume_truncate=1]]",
+ HELIUM_NAME, helium_mount);
+ if ((ret = cfg->conn->load_extension(
+ cfg->conn, HELIUM_PATH, helium_buf)) != 0)
+ lprintf(cfg,
+ ret, 0, "Error loading Helium: %s", helium_buf);
+ }
+
if (cfg->create != 0) { /* If creating, create the table. */
if ((ret = cfg->conn->open_session(
- cfg->conn, NULL, NULL, &session)) != 0) {
+ cfg->conn, NULL, cfg->sess_config, &session)) != 0) {
lprintf(cfg, ret, 0,
"Error opening a session on %s", cfg->home);
goto err;
@@ -1260,14 +1308,20 @@ main(int argc, char *argv[])
total_ops = g_read_ops + g_insert_ops + g_update_ops;
lprintf(cfg, 0, 1,
- "Executed %" PRIu64 " read operations (%" PRIu64 "%%)",
- g_read_ops, (g_read_ops * 100) / total_ops);
+ "Executed %" PRIu64 " read operations (%" PRIu64
+ "%%) %" PRIu64 " ops/sec",
+ g_read_ops, (g_read_ops * 100) / total_ops,
+ g_read_ops / cfg->run_time);
lprintf(cfg, 0, 1,
- "Executed %" PRIu64 " insert operations (%" PRIu64 "%%)",
- g_insert_ops, (g_insert_ops * 100) / total_ops);
+ "Executed %" PRIu64 " insert operations (%" PRIu64
+ "%%) %" PRIu64 " ops/sec",
+ g_insert_ops, (g_insert_ops * 100) / total_ops,
+ g_insert_ops / cfg->run_time);
lprintf(cfg, 0, 1,
- "Executed %" PRIu64 " update operations (%" PRIu64 "%%)",
- g_update_ops, (g_update_ops * 100) / total_ops);
+ "Executed %" PRIu64 " update operations (%" PRIu64
+ "%%) %" PRIu64 " ops/sec",
+ g_update_ops, (g_update_ops * 100) / total_ops,
+ g_update_ops / cfg->run_time);
lprintf(cfg, 0, 1,
"Executed %" PRIu64 " checkpoint operations",
g_ckpt_ops);
@@ -1348,7 +1402,12 @@ start_threads(CONFIG *cfg,
return (enomem(cfg));
if ((thread->value_buf = calloc(cfg->value_sz, 1)) == NULL)
return (enomem(cfg));
+ /*
+ * Initialize the buffer, then toss in a few random values if needed.
+ */
memset(thread->value_buf, 'a', cfg->value_sz - 1);
+ if (cfg->random_value)
+ randomize_value(cfg, thread->value_buf);
/*
* Every thread gets tracking information and is initialized
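
The wtperf changes above thread the new sess_config option through every
WT_CONNECTION.open_session call; the riak-style runner files set it to
"isolation=snapshot". A standalone sketch of the same pattern (the function
name, home directory and configuration strings are examples only):

    #include <wiredtiger.h>

    static int
    open_snapshot_session(
        const char *home, WT_CONNECTION **connp, WT_SESSION **sessionp)
    {
        int ret;

        if ((ret = wiredtiger_open(home, NULL, "create", connp)) != 0)
            return (ret);
        /* Equivalent of conn->open_session(conn, NULL, cfg->sess_config, ...). */
        return ((*connp)->open_session(
            *connp, NULL, "isolation=snapshot", sessionp));
    }
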
diff --git a/bench/wtperf/wtperf.h b/bench/wtperf/wtperf.h
index afaf56c243a..54c4334a2ac 100644
--- a/bench/wtperf/wtperf.h
+++ b/bench/wtperf/wtperf.h
@@ -63,6 +63,7 @@ typedef struct {
struct __config { /* Configuration structure */
const char *home; /* WiredTiger home */
+ const char *monitor_dir; /* Monitor output dir */
char *uri; /* Object URI */
WT_CONNECTION *conn; /* Database connection */
@@ -98,6 +99,11 @@ typedef struct {
#define ELEMENTS(a) (sizeof(a) / sizeof(a[0]))
+/* From include/os.h */
+#define WT_TIMEDIFF(end, begin) \
+ (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) + \
+ (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)
+
#define THOUSAND (1000ULL)
#define MILLION (1000000ULL)
#define BILLION (1000000000ULL)
@@ -125,9 +131,10 @@ typedef struct {
* the last_XXX fields.
*/
uint64_t ops; /* Total operations */
+ uint64_t latency_ops; /* Total ops sampled for latency */
uint64_t latency; /* Total latency */
- uint64_t last_ops; /* Last read by monitor thread */
+ uint64_t last_latency_ops; /* Last read by monitor thread */
uint64_t last_latency;
/*
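
The WT_TIMEDIFF macro above returns the elapsed time between two struct
timespec values in nanoseconds; wtperf converts the result to microseconds
before feeding track_operation. A self-contained sketch of the same
measurement, using clock_gettime in place of wtperf's internal __wt_epoch:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define WT_TIMEDIFF(end, begin)                                       \
        (1000000000 * (uint64_t)((end).tv_sec - (begin).tv_sec) +         \
        (uint64_t)(end).tv_nsec - (uint64_t)(begin).tv_nsec)

    int
    main(void)
    {
        struct timespec start, stop;

        (void)clock_gettime(CLOCK_MONOTONIC, &start);
        /* ... the operation being timed goes here ... */
        (void)clock_gettime(CLOCK_MONOTONIC, &stop);

        /* Nanoseconds to microseconds, as in ns_to_us(WT_TIMEDIFF(stop, start)). */
        printf("elapsed: %" PRIu64 " us\n", WT_TIMEDIFF(stop, start) / 1000);
        return (0);
    }
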
diff --git a/bench/wtperf/wtperf_opt.i b/bench/wtperf/wtperf_opt.i
index 28a2cb51662..4e11799781f 100644
--- a/bench/wtperf/wtperf_opt.i
+++ b/bench/wtperf/wtperf_opt.i
@@ -84,7 +84,6 @@ DEF_OPT_AS_CONFIG_STRING(conn_config, "create",
DEF_OPT_AS_BOOL(compact, 0, "post-populate compact for LSM merging activity")
DEF_OPT_AS_BOOL(create, 1,
"do population phase; false to use existing database")
-DEF_OPT_AS_UINT32(value_sz, 100, "value size")
DEF_OPT_AS_UINT32(icount, 5000, "number of records to initially populate")
DEF_OPT_AS_BOOL(insert_rmw, 0,
"execute a read prior to each insert in workload phase")
@@ -97,7 +96,8 @@ DEF_OPT_AS_UINT32(populate_threads, 1,
"number of populate threads, 1 for bulk load")
DEF_OPT_AS_UINT32(random_range, 0,
"if non zero choose a value from within this range as the key for "
- "operations")
+ "insert operations")
+DEF_OPT_AS_BOOL(random_value, 0, "generate random content for the value")
DEF_OPT_AS_UINT32(report_interval, 2,
"output throughput information every interval seconds, 0 to disable")
DEF_OPT_AS_UINT32(run_ops, 0,
@@ -109,22 +109,25 @@ DEF_OPT_AS_UINT32(sample_interval, 0,
DEF_OPT_AS_UINT32(sample_rate, 50,
"how often the latency of operations is measured. One for every operation,"
"two for every second operation, three for every third operation etc.")
+DEF_OPT_AS_CONFIG_STRING(sess_config, "", "session configuration string")
DEF_OPT_AS_CONFIG_STRING(table_config,
"key_format=S,value_format=S,type=lsm,exclusive=true,"
"leaf_page_max=4kb,internal_page_max=64kb,allocation_size=4kb,",
"table configuration string")
-DEF_OPT_AS_STRING(threads, "", "worker thread configuration: each 'count' "
+DEF_OPT_AS_STRING(threads, "", "workload configuration: each 'count' "
"entry is the total number of threads, and the 'insert', 'read' and "
"'update' entries are the ratios of insert, read and update operations "
"done by each worker thread; multiple workload configurations may be "
"specified; for example, a more complex threads configuration might be "
"'threads=((count=2,reads=1)(count=8,reads=1,inserts=2,updates=1))' "
"which would create 2 threads doing nothing but reads and 8 threads "
- "each doing 50% inserts and 25% reads and updates")
+ "each doing 50% inserts and 25% reads and updates. Allowed"
+ "configuration values are 'count', 'reads', 'inserts', 'updates'")
DEF_OPT_AS_CONFIG_STRING(transaction_config, "",
"transaction configuration string, relevant when populate_opts_per_txn "
"is nonzero")
DEF_OPT_AS_STRING(table_name, "test", "table name")
+DEF_OPT_AS_UINT32(value_sz, 100, "value size")
DEF_OPT_AS_UINT32(verbose, 1, "verbosity")
#undef DEF_OPT_AS_BOOL
diff --git a/build_posix/Make.subdirs b/build_posix/Make.subdirs
index 21600fd2a29..4c5c8503312 100644
--- a/build_posix/Make.subdirs
+++ b/build_posix/Make.subdirs
@@ -12,8 +12,9 @@ ext/collators/reverse
ext/compressors/bzip2 BZIP2
ext/compressors/nop
ext/compressors/snappy SNAPPY
+ext/compressors/zlib ZLIB
+ext/datasources/helium HAVE_HELIUM
ext/test/kvs_bdb HAVE_BERKELEY_DB
-ext/test/memrata HAVE_MEMRATA
lang/java JAVA
lang/python PYTHON
test/bloom
diff --git a/build_posix/aclocal/options.m4 b/build_posix/aclocal/options.m4
index 00bc1daf8f0..c1bf988e91d 100644
--- a/build_posix/aclocal/options.m4
+++ b/build_posix/aclocal/options.m4
@@ -122,4 +122,21 @@ pthread_logging|pthreads_logging)
esac
AC_MSG_RESULT($with_spinlock)
+AC_MSG_CHECKING(if --enable-zlib option specified)
+AC_ARG_ENABLE(zlib,
+ [AS_HELP_STRING([--enable-zlib],
+ [Build the zlib compressor extension.])], r=$enableval, r=no)
+case "$r" in
+no) wt_cv_enable_zlib=no;;
+*) wt_cv_enable_zlib=yes;;
+esac
+AC_MSG_RESULT($wt_cv_enable_zlib)
+if test "$wt_cv_enable_zlib" = "yes"; then
+ AC_CHECK_HEADER(zlib.h,,
+ [AC_MSG_ERROR([--enable-zlib requires zlib.h])])
+ AC_CHECK_LIB(z, deflate,,
+ [AC_MSG_ERROR([--enable-zlib requires zlib library])])
+fi
+AM_CONDITIONAL([ZLIB], [test "$wt_cv_enable_zlib" = "yes"])
+
])
diff --git a/build_posix/aclocal/version-set.m4 b/build_posix/aclocal/version-set.m4
index 7b56b726e76..6d51c171c1e 100644
--- a/build_posix/aclocal/version-set.m4
+++ b/build_posix/aclocal/version-set.m4
@@ -1,14 +1,14 @@
dnl build by dist/s_version
VERSION_MAJOR=2
-VERSION_MINOR=0
-VERSION_PATCH=2
-VERSION_STRING='"WiredTiger 2.0.2: (December 12, 2013)"'
+VERSION_MINOR=1
+VERSION_PATCH=1
+VERSION_STRING='"WiredTiger 2.1.1: (February 4, 2014)"'
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
AC_SUBST(VERSION_PATCH)
AC_SUBST(VERSION_STRING)
-VERSION_NOPATCH=2.0
+VERSION_NOPATCH=2.1
AC_SUBST(VERSION_NOPATCH)
diff --git a/build_posix/aclocal/version.m4 b/build_posix/aclocal/version.m4
index b406ad5f4b8..80fe1bb193f 100644
--- a/build_posix/aclocal/version.m4
+++ b/build_posix/aclocal/version.m4
@@ -1,2 +1,2 @@
dnl WiredTiger product version for AC_INIT. Maintained by dist/s_version
-2.0.2
+2.1.1
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 3c07bd3cb29..8e57b7c6a88 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -119,11 +119,11 @@ if test "$ac_cv_func_posix_memalign" = "yes" ; then
esac
fi
AC_DEFINE_UNQUOTED(WT_BUFFER_ALIGNMENT_DEFAULT, $BUFFER_ALIGNMENT,
- [Default alignment of bufffers used for I/O])
+ [Default alignment of buffers used for I/O])
AC_SUBST(AM_CFLAGS)
-# test/format requires an installed Berkeley DB release tree.
+# test/format requires an installed Oracle Berkeley DB release tree.
AC_MSG_CHECKING([if --with-berkeleydb=DIR option specified])
AC_ARG_WITH(berkeleydb,
[AS_HELP_STRING([--with-berkeleydb=DIR],
@@ -133,13 +133,15 @@ AC_MSG_RESULT($with_berkeleydb)
AM_CONDITIONAL([HAVE_BERKELEY_DB], [test -d $with_berkeleydb])
AC_SUBST(BERKELEY_DB_PATH, [$with_berkeleydb])
-# test/format optionally supports the MEMRATA KVS library.
-memrata_dir=`ls -d "$ac_pwd/memrata" 2>/dev/null | head -1`
-if ! test -d "$memrata_dir" ; then
- memrata_dir="NO_MEMRATA_LIBRARY_FOUND"
-fi
-AM_CONDITIONAL([HAVE_MEMRATA], [test -d $memrata_dir])
-AC_SUBST(MEMRATA_PATH, [$memrata_dir])
+# test/format optionally supports the Levyx/Helium key/value store.
+AC_MSG_CHECKING([if --with-helium=DIR option specified])
+AC_ARG_WITH(helium,
+ [AS_HELP_STRING([--with-helium=DIR],
+ [Specify installed library directory of Helium])],
+ [with_helium="$withval"], [with_helium="NO_HELIUM_LIBRARY"])
+AC_MSG_RESULT($with_helium)
+AM_CONDITIONAL([HAVE_HELIUM], [test -d $with_helium])
+AC_SUBST(HELIUM_PATH, [$with_helium])
# Warn that diagnostic builds should not be used in production
if test "$wt_cv_enable_diagnostic" = "yes"; then
diff --git a/dist/api_data.py b/dist/api_data.py
index 4ede19bc4eb..e74359425fb 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -124,7 +124,10 @@ lsm_config = [
be larger than chunk_size''',
min='100MB', max='10TB'),
Config('chunk_size', '10MB', r'''
- the maximum size of the in-memory chunk of an LSM tree''',
+ the maximum size of the in-memory chunk of an LSM tree. This
+ limit is soft - it is possible for chunks to be temporarily
+ larger than this value. This overrides the \c memory_page_max
+ setting''',
min='512K', max='500MB'),
Config('merge_max', '15', r'''
the maximum number of chunks to include in a merge operation''',
@@ -231,10 +234,10 @@ file_config = format_meta + [
min=0),
Config('memory_page_max', '5MB', r'''
the maximum size a page can grow to in memory before being
- reconciled to disk. The specified size will be adjusted to a
- lower bound of <code>50 * leaf_page_max</code>. This limit is
- soft - it is possible for pages to be temporarily larger than
- this value''',
+ reconciled to disk. The specified size will be adjusted to a lower
+ bound of <code>50 * leaf_page_max</code>. This limit is soft - it
+ is possible for pages to be temporarily larger than this value.
+ This setting is ignored for LSM trees, see \c chunk_size''',
min='512B', max='10TB'),
Config('os_cache_max', '0', r'''
maximum system buffer cache usage, in bytes. If non-zero, evict
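
To illustrate the documentation change above: for an LSM tree the in-memory
size is governed by the lsm chunk_size setting and memory_page_max is ignored.
A hypothetical creation call (the URI and sizes are examples only, matching
the configuration syntax used by the wtperf runner files in this change):

    #include <wiredtiger.h>

    /* Create an LSM table whose in-memory chunks are capped at roughly 20MB. */
    static int
    create_lsm_table(WT_SESSION *session)
    {
        return (session->create(session, "table:bucket",
            "type=lsm,key_format=S,value_format=S,"
            "lsm=(chunk_size=20MB,merge_threads=2)"));
    }
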
diff --git a/dist/s_string.ok b/dist/s_string.ok
index c6d18377076..a71cca62a8c 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -142,6 +142,7 @@ LSN
LSNs
LZO
LeafGreen
+Levyx
Llqr
Llqrt
LoadLoad
@@ -155,7 +156,6 @@ MUTEX
MVCC
Manos
Marsaglia's
-Memrata
Metadata
Mewhort
Multi
@@ -213,6 +213,7 @@ SIMD
SLIST
SLVG
SML
+SOURCE's
SPINLOCK
SQL
SSD
@@ -255,6 +256,7 @@ Unmarshall
Unregister
VARCHAR
VLDB
+VMSG
Vanishingly
Vc
Vixie
@@ -269,8 +271,10 @@ WiredTiger
WiredTiger's
WiredTigerCheckpoint
WiredTigerHome
+WiredTigerInit
WiredTigerLog
WiredTigerStat
+WiredTigerTxn
WithSeeds
Wmissing
Wuninitialized
@@ -394,6 +398,7 @@ database's
datalen
datasets
datasource
+datastore
dbc
decile
deciles
@@ -401,6 +406,8 @@ decl
decr
decrement
decrementing
+deflateEnd
+deflateInit
defno
del
delfmt
@@ -534,6 +541,8 @@ indices
indirects
indx
infeasible
+inflateEnd
+inflateInit
init
initn
initsize
@@ -570,6 +579,7 @@ lf
lfence
libdatasource
libs
+libwiredtiger
lld
llll
llu
@@ -607,7 +617,6 @@ membar
memcpy
memfree
memmove
-memrata
memset
memsize
mergeable
@@ -653,6 +662,7 @@ nocase
nonliteral
noop
nop
+noraw
notfound
notset
notsup
@@ -935,4 +945,5 @@ xff
xxxx
xxxxx
xxxxxx
+zlib
zu
diff --git a/dist/stat_data.py b/dist/stat_data.py
index 1007cf71a11..72babeb881a 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -153,6 +153,9 @@ connection_stats = [
##########################################
# LSM statistics
##########################################
+ Stat('lsm_checkpoint_throttle',
+ 'sleep for LSM checkpoint throttle'),
+ Stat('lsm_merge_throttle', 'sleep for LSM merge throttle'),
Stat('lsm_rows_merged', 'rows merged in an LSM tree'),
##########################################
@@ -244,6 +247,8 @@ dsrc_stats = [
'bloom filter pages evicted from cache'),
Stat('bloom_page_read', 'bloom filter pages read into cache'),
Stat('bloom_size', 'total size of bloom filters', 'no_scale'),
+ Stat('lsm_checkpoint_throttle',
+ 'sleep for LSM checkpoint throttle'),
Stat('lsm_chunk_count',
'chunks in the LSM tree', 'no_aggregate,no_scale'),
Stat('lsm_generation_max',
@@ -252,6 +257,7 @@ dsrc_stats = [
Stat('lsm_lookup_no_bloom',
'queries that could have benefited ' +
'from a Bloom filter that did not exist'),
+ Stat('lsm_merge_throttle', 'sleep for LSM merge throttle'),
##########################################
# Block manager statistics
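
The new LSM throttle statistics above are read the same way as any other
WiredTiger statistic, for example by walking a statistics cursor (statistics
must be enabled in the connection configuration, as the wtperf runner files in
this change do with statistics=(fast)). A sketch with minimal error handling:

    #include <stdint.h>
    #include <stdio.h>
    #include <wiredtiger.h>

    static int
    dump_connection_stats(WT_SESSION *session)
    {
        WT_CURSOR *cursor;
        const char *desc, *pvalue;
        int64_t value;
        int ret;

        if ((ret = session->open_cursor(
            session, "statistics:", NULL, NULL, &cursor)) != 0)
            return (ret);
        while ((ret = cursor->next(cursor)) == 0) {
            if ((ret = cursor->get_value(
                cursor, &desc, &pvalue, &value)) != 0)
                break;
            printf("%s=%s\n", desc, pvalue);
        }
        (void)cursor->close(cursor);
        return (ret == WT_NOTFOUND ? 0 : ret);
    }
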
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index b67377415e5..a135b66da19 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -542,6 +542,13 @@ session_ops(WT_SESSION *session)
"block_compressor=snappy,key_format=S,value_format=S");
/*! [Create a snappy compressed table] */
ret = session->drop(session, "table:mytable", NULL);
+
+ /*! [Create a zlib compressed table] */
+ ret = session->create(session,
+ "table:mytable",
+ "block_compressor=zlib,key_format=S,value_format=S");
+ /*! [Create a zlib compressed table] */
+ ret = session->drop(session, "table:mytable", NULL);
#endif
/*! [Configure checksums to uncompressed] */
@@ -959,7 +966,7 @@ main(void)
/*! [Configure bzip2 extension] */
ret = wiredtiger_open(home, NULL,
"create,"
- "extensions=[/usr/local/lib/wiredtiger_bzip2.so]", &conn);
+ "extensions=[/usr/local/lib/libwiredtiger_bzip2.so]", &conn);
/*! [Configure bzip2 extension] */
if (ret == 0)
(void)conn->close(conn, NULL);
@@ -967,11 +974,19 @@ main(void)
/*! [Configure snappy extension] */
ret = wiredtiger_open(home, NULL,
"create,"
- "extensions=[/usr/local/lib/wiredtiger_snappy.so]", &conn);
+ "extensions=[/usr/local/lib/libwiredtiger_snappy.so]", &conn);
/*! [Configure snappy extension] */
if (ret == 0)
(void)conn->close(conn, NULL);
+ /*! [Configure zlib extension] */
+ ret = wiredtiger_open(home, NULL,
+ "create,"
+ "extensions=[/usr/local/lib/libwiredtiger_zlib.so]", &conn);
+ /*! [Configure zlib extension] */
+ if (ret == 0)
+ (void)conn->close(conn, NULL);
+
/*
* This example code gets run, and direct I/O might not be available,
* causing the open to fail. The documentation requires code snippets,
diff --git a/ext/compressors/bzip2/bzip2_compress.c b/ext/compressors/bzip2/bzip2_compress.c
index 73e9ef3a932..dd97e2abee3 100644
--- a/ext/compressors/bzip2/bzip2_compress.c
+++ b/ext/compressors/bzip2/bzip2_compress.c
@@ -42,12 +42,10 @@ bzip2_decompress(WT_COMPRESSOR *, WT_SESSION *,
uint8_t *, size_t, uint8_t *, size_t, size_t *);
static int
bzip2_terminate(WT_COMPRESSOR *, WT_SESSION *);
-#ifdef WIREDTIGER_TEST_COMPRESS_RAW
static int
bzip2_compress_raw(WT_COMPRESSOR *, WT_SESSION *, size_t, int,
size_t, uint8_t *, uint32_t *, uint32_t, uint8_t *, size_t, int,
size_t *, uint32_t *);
-#endif
/* Local compressor structure. */
typedef struct {
@@ -70,18 +68,26 @@ typedef struct {
WT_SESSION *session;
} BZIP_OPAQUE;
-int
-wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+/*
+ * bzip2_add_compressor --
+ * Add a bzip2 compressor.
+ */
+static int
+bzip2_add_compressor(WT_CONNECTION *connection, int raw, const char *name)
{
BZIP_COMPRESSOR *bzip_compressor;
- (void)config; /* Unused parameters */
-
+ /*
+ * There are two almost identical bzip2 compressors: one supporting raw
+ * compression (used by test/format to test raw compression), the other
+ * without raw compression, which might be useful for real applications.
+ */
if ((bzip_compressor = calloc(1, sizeof(BZIP_COMPRESSOR))) == NULL)
return (errno);
bzip_compressor->compressor.compress = bzip2_compress;
- bzip_compressor->compressor.compress_raw = NULL;
+ bzip_compressor->compressor.compress_raw =
+     raw ? bzip2_compress_raw : NULL;
bzip_compressor->compressor.decompress = bzip2_decompress;
bzip_compressor->compressor.pre_size = NULL;
bzip_compressor->compressor.terminate = bzip2_terminate;
@@ -109,15 +115,22 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
*/
bzip_compressor->bz_small = 0;
- /* Load the compressor */
-#ifdef WIREDTIGER_TEST_COMPRESS_RAW
- bzip_compressor->compressor.compress_raw = bzip2_compress_raw;
- return (connection->add_compressor(
- connection, "raw", (WT_COMPRESSOR *)bzip_compressor, NULL));
-#else
- return (connection->add_compressor(
- connection, "bzip2", (WT_COMPRESSOR *)bzip_compressor, NULL));
-#endif
+ return (connection->add_compressor( /* Load the compressor */
+ connection, name, (WT_COMPRESSOR *)bzip_compressor, NULL));
+}
+
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+ int ret;
+
+ (void)config; /* Unused parameters */
+
+ if ((ret = bzip2_add_compressor(connection, 0, "bzip2")) != 0)
+ return (ret);
+ if ((ret = bzip2_add_compressor(connection, 1, "bzip2-raw-test")) != 0)
+ return (ret);
+ return (0);
}
/*
@@ -238,7 +251,6 @@ bzip2_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
return (0);
}
-#ifdef WIREDTIGER_TEST_COMPRESS_RAW
/*
* __bzip2_compress_raw_random --
* Return a 32-bit pseudo-random number.
@@ -328,9 +340,8 @@ bzip2_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
(uintmax_t)page_max, split_pct, (uintmax_t)extra,
slots, take, offsets[take], (uintmax_t)*result_lenp);
#endif
- return (0);
+ return (take == 0 ? EAGAIN : 0);
}
-#endif
static int
bzip2_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
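
The bzip2 extension now registers two compressors instead of choosing one at compile time. The registration pattern itself is generic; as an illustrative sketch (not part of this tree), here is a pass-through "nop" compressor built from the same WT_COMPRESSOR callbacks used above:

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <wiredtiger.h>
#include <wiredtiger_ext.h>

typedef struct {
	WT_COMPRESSOR compressor;		/* Must come first */
	WT_EXTENSION_API *wt_api;		/* Extension API */
} NOP_COMPRESSOR;

static int
nop_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
    uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len,
    size_t *result_lenp, int *compression_failed)
{
	(void)compressor; (void)session;

	if (dst_len < src_len) {		/* Not enough room: skip. */
		*compression_failed = 1;
		return (0);
	}
	memcpy(dst, src, src_len);
	*compression_failed = 0;
	*result_lenp = src_len;
	return (0);
}

static int
nop_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
    uint8_t *src, size_t src_len, uint8_t *dst, size_t dst_len,
    size_t *result_lenp)
{
	(void)compressor; (void)session;

	if (dst_len < src_len)
		return (ENOMEM);
	memcpy(dst, src, src_len);
	*result_lenp = src_len;
	return (0);
}

static int
nop_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
{
	(void)session;

	free(compressor);
	return (0);
}

int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
{
	NOP_COMPRESSOR *nop;

	(void)config;				/* Unused parameters */

	if ((nop = calloc(1, sizeof(NOP_COMPRESSOR))) == NULL)
		return (errno);
	nop->compressor.compress = nop_compress;
	nop->compressor.compress_raw = NULL;	/* No raw support */
	nop->compressor.decompress = nop_decompress;
	nop->compressor.pre_size = NULL;
	nop->compressor.terminate = nop_terminate;
	nop->wt_api = connection->get_extension_api(connection);

	return (connection->add_compressor(
	    connection, "nop", (WT_COMPRESSOR *)nop, NULL));
}

Loading it would follow the same extensions=[...] pattern as the bzip2 and zlib snippets in ex_all.c above.
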
diff --git a/ext/compressors/zlib/Makefile.am b/ext/compressors/zlib/Makefile.am
new file mode 100644
index 00000000000..373277c92c2
--- /dev/null
+++ b/ext/compressors/zlib/Makefile.am
@@ -0,0 +1,6 @@
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+
+lib_LTLIBRARIES = libwiredtiger_zlib.la
+libwiredtiger_zlib_la_SOURCES = zlib_compress.c
+libwiredtiger_zlib_la_LDFLAGS = -avoid-version -module
+libwiredtiger_zlib_la_LIBADD = -lz
diff --git a/ext/compressors/zlib/zlib_compress.c b/ext/compressors/zlib/zlib_compress.c
new file mode 100644
index 00000000000..a48037c8526
--- /dev/null
+++ b/ext/compressors/zlib/zlib_compress.c
@@ -0,0 +1,367 @@
+/*-
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <zlib.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include <wiredtiger_ext.h>
+
+/* Local compressor structure. */
+typedef struct {
+ WT_COMPRESSOR compressor; /* Must come first */
+
+ WT_EXTENSION_API *wt_api; /* Extension API */
+
+ int zlib_level; /* Configuration */
+} ZLIB_COMPRESSOR;
+
+/*
+ * zlib gives us a cookie to pass to the underlying allocation functions; we
+ * need two handles, package them up.
+ */
+typedef struct {
+ WT_COMPRESSOR *compressor;
+ WT_SESSION *session;
+} ZLIB_OPAQUE;
+
+/*
+ * zlib_error --
+ * Output an error message, and return a standard error code.
+ */
+static int
+zlib_error(
+ WT_COMPRESSOR *compressor, WT_SESSION *session, const char *call, int zret)
+{
+ WT_EXTENSION_API *wt_api;
+
+ wt_api = ((ZLIB_COMPRESSOR *)compressor)->wt_api;
+
+ (void)wt_api->err_printf(wt_api, session,
+ "zlib error: %s: %s: %d", call, zError(zret), zret);
+ return (WT_ERROR);
+}
+
+static void *
+zalloc(void *cookie, u_int number, u_int size)
+{
+ ZLIB_OPAQUE *opaque;
+ WT_EXTENSION_API *wt_api;
+
+ opaque = cookie;
+ wt_api = ((ZLIB_COMPRESSOR *)opaque->compressor)->wt_api;
+ return (wt_api->scr_alloc(
+ wt_api, opaque->session, (size_t)(number * size)));
+}
+
+static void
+zfree(void *cookie, void *p)
+{
+ ZLIB_OPAQUE *opaque;
+ WT_EXTENSION_API *wt_api;
+
+ opaque = cookie;
+ wt_api = ((ZLIB_COMPRESSOR *)opaque->compressor)->wt_api;
+ wt_api->scr_free(wt_api, opaque->session, p);
+}
+
+static int
+zlib_compress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp, int *compression_failed)
+{
+ ZLIB_COMPRESSOR *zlib_compressor;
+ ZLIB_OPAQUE opaque;
+ z_stream zs;
+ int ret;
+
+ zlib_compressor = (ZLIB_COMPRESSOR *)compressor;
+
+ memset(&zs, 0, sizeof(zs));
+ zs.zalloc = zalloc;
+ zs.zfree = zfree;
+ opaque.compressor = compressor;
+ opaque.session = session;
+ zs.opaque = &opaque;
+
+ if ((ret = deflateInit(&zs, zlib_compressor->zlib_level)) != Z_OK)
+ return (zlib_error(
+ compressor, session, "deflateInit", ret));
+
+ zs.next_in = src;
+ zs.avail_in = (uint32_t)src_len;
+ zs.next_out = dst;
+ zs.avail_out = (uint32_t)dst_len - 1;
+ while ((ret = deflate(&zs, Z_FINISH)) == Z_OK)
+ ;
+ if (ret == Z_STREAM_END) {
+ *compression_failed = 0;
+ *result_lenp = zs.total_out;
+ } else
+ *compression_failed = 1;
+
+ if ((ret = deflateEnd(&zs)) != Z_OK)
+ return (
+ zlib_error(compressor, session, "deflateEnd", ret));
+
+ return (0);
+}
+
+/*
+ * zlib_find_slot --
+ * Find the slot containing the target offset (binary search).
+ */
+static inline uint32_t
+zlib_find_slot(uint32_t target, uint32_t *offsets, uint32_t slots)
+{
+ uint32_t base, indx, limit;
+
+ indx = 1;
+
+ /* Figure out which slot we got to: binary search */
+ if (target >= offsets[slots])
+ indx = slots;
+ else if (target > offsets[1])
+ for (base = 2, limit = slots - base; limit != 0; limit >>= 1) {
+ indx = base + (limit >> 1);
+ if (target < offsets[indx])
+ continue;
+ base = indx + 1;
+ --limit;
+ }
+
+ return (indx);
+}
+
+/*
+ * zlib_compress_raw --
+ * Pack records into a specified on-disk page size.
+ */
+static int
+zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ size_t page_max, int split_pct, size_t extra,
+ uint8_t *src, uint32_t *offsets, uint32_t slots,
+ uint8_t *dst, size_t dst_len, int final,
+ size_t *result_lenp, uint32_t *result_slotsp)
+{
+ ZLIB_COMPRESSOR *zlib_compressor;
+ ZLIB_OPAQUE opaque;
+ z_stream last_zs, zs;
+ uint32_t curr_slot, last_slot;
+ int ret;
+
+ curr_slot = last_slot = 0;
+ (void)split_pct;
+ (void)dst_len;
+ (void)final;
+
+ zlib_compressor = (ZLIB_COMPRESSOR *)compressor;
+
+ memset(&zs, 0, sizeof(zs));
+ zs.zalloc = zalloc;
+ zs.zfree = zfree;
+ opaque.compressor = compressor;
+ opaque.session = session;
+ zs.opaque = &opaque;
+
+ if ((ret = deflateInit(&zs,
+ zlib_compressor->zlib_level)) != Z_OK)
+ return (zlib_error(
+ compressor, session, "deflateInit", ret));
+
+ zs.next_in = src;
+ zs.next_out = dst;
+ /*
+ * Experimentally derived, reserve this many bytes for zlib to finish
+ * up a buffer. If this isn't sufficient, we don't fail but we will be
+ * inefficient.
+ */
+#define WT_ZLIB_RESERVED 12
+ zs.avail_out = (uint32_t)(page_max - extra - WT_ZLIB_RESERVED);
+ last_zs = zs;
+
+ /*
+ * Strategy: take the available output size and compress that much
+ * input. Continue until there is no input small enough or the
+ * compression fails to fit.
+ */
+ while (zs.avail_out > 0) {
+ /* Find the slot we will try to compress up to. */
+ if ((curr_slot = zlib_find_slot(
+ zs.total_in + zs.avail_out, offsets, slots)) <= last_slot)
+ break;
+
+ zs.avail_in = offsets[curr_slot] - offsets[last_slot];
+ /* Save the stream state in case the chosen data doesn't fit. */
+ last_zs = zs;
+
+ while (zs.avail_in > 0 && zs.avail_out > 0)
+ if ((ret = deflate(&zs, Z_SYNC_FLUSH)) != Z_OK)
+ return (zlib_error(
+ compressor, session, "deflate", ret));
+
+ /* Roll back if the last deflate didn't complete. */
+ if (zs.avail_in > 0) {
+ zs = last_zs;
+ break;
+ } else
+ last_slot = curr_slot;
+ }
+
+ zs.avail_out += WT_ZLIB_RESERVED;
+ while ((ret = deflate(&zs, Z_FINISH)) == Z_OK)
+ ;
+ /*
+ * If the end marker didn't fit, report that we got no work done. WT
+ * will compress the (possibly large) page image using ordinary
+ * compression instead.
+ */
+ if (ret == Z_BUF_ERROR)
+ last_slot = 0;
+ else if (ret != Z_STREAM_END)
+ return (
+ zlib_error(compressor, session, "deflate end block", ret));
+
+ if ((ret = deflateEnd(&zs)) != Z_OK && ret != Z_DATA_ERROR)
+ return (
+ zlib_error(compressor, session, "deflateEnd", ret));
+
+ if (last_slot > 0) {
+ *result_slotsp = last_slot;
+ *result_lenp = zs.total_out;
+ } else {
+ /* We didn't manage to compress anything: don't retry. */
+ *result_slotsp = 0;
+ *result_lenp = 1;
+ }
+
+#if 0
+ fprintf(stderr,
+ "zlib_compress_raw (%s): page_max %" PRIuMAX ", slots %" PRIu32
+ ", take %" PRIu32 ": %" PRIu32 " -> %" PRIuMAX "\n",
+ final ? "final" : "not final", (uintmax_t)page_max,
+ slots, last_slot, offsets[last_slot], (uintmax_t)*result_lenp);
+#endif
+ return (0);
+}
+
+static int
+zlib_decompress(WT_COMPRESSOR *compressor, WT_SESSION *session,
+ uint8_t *src, size_t src_len,
+ uint8_t *dst, size_t dst_len,
+ size_t *result_lenp)
+{
+ ZLIB_OPAQUE opaque;
+ z_stream zs;
+ int ret, tret;
+
+ memset(&zs, 0, sizeof(zs));
+ zs.zalloc = zalloc;
+ zs.zfree = zfree;
+ opaque.compressor = compressor;
+ opaque.session = session;
+ zs.opaque = &opaque;
+
+ if ((ret = inflateInit(&zs)) != Z_OK)
+ return (zlib_error(
+ compressor, session, "inflateInit", ret));
+
+ zs.next_in = src;
+ zs.avail_in = (uint32_t)src_len;
+ zs.next_out = dst;
+ zs.avail_out = (uint32_t)dst_len;
+ while ((ret = inflate(&zs, Z_FINISH)) == Z_OK)
+ ;
+ if (ret == Z_STREAM_END) {
+ *result_lenp = zs.total_out;
+ ret = Z_OK;
+ }
+
+ if ((tret = inflateEnd(&zs)) != Z_OK && ret == Z_OK)
+ ret = tret;
+
+ return (ret == Z_OK ?
+ 0 : zlib_error(compressor, session, "inflate", ret));
+}
+
+static int
+zlib_terminate(WT_COMPRESSOR *compressor, WT_SESSION *session)
+{
+ (void)session; /* Unused parameters */
+
+ free(compressor);
+ return (0);
+}
+
+static int
+zlib_add_compressor(WT_CONNECTION *connection, int raw, const char *name)
+{
+ ZLIB_COMPRESSOR *zlib_compressor;
+
+ /*
+ * There are two almost identical zlib compressors: one supporting raw
+ * compression, and one without.
+ */
+ if ((zlib_compressor = calloc(1, sizeof(ZLIB_COMPRESSOR))) == NULL)
+ return (errno);
+
+ zlib_compressor->compressor.compress = zlib_compress;
+ zlib_compressor->compressor.compress_raw = raw ?
+ zlib_compress_raw : NULL;
+ zlib_compressor->compressor.decompress = zlib_decompress;
+ zlib_compressor->compressor.pre_size = NULL;
+ zlib_compressor->compressor.terminate = zlib_terminate;
+
+ zlib_compressor->wt_api = connection->get_extension_api(connection);
+
+ /*
+ * between 0-10: level: see zlib manual.
+ */
+ zlib_compressor->zlib_level = Z_DEFAULT_COMPRESSION;
+
+ /* Load the standard compressor. */
+ return (connection->add_compressor(
+ connection, name, &zlib_compressor->compressor, NULL));
+}
+
+int
+wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
+{
+ int ret;
+
+ (void)config; /* Unused parameters */
+
+ if ((ret = zlib_add_compressor(connection, 1, "zlib")) != 0)
+ return (ret);
+ if ((ret = zlib_add_compressor(connection, 0, "zlib-noraw")) != 0)
+ return (ret);
+ return (0);
+}
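
The raw compression path above depends on zlib_find_slot choosing how many records to feed to deflate. A standalone copy of that search, exercised with made-up offsets, shows its behavior; note the search can land on a slot past the target, which is one reason zlib_compress_raw saves the stream state in last_zs and rolls back when the chosen range doesn't fit:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Copied from zlib_find_slot above so it can be exercised by itself. */
static uint32_t
find_slot(uint32_t target, const uint32_t *offsets, uint32_t slots)
{
	uint32_t base, indx, limit;

	indx = 1;
	if (target >= offsets[slots])
		indx = slots;
	else if (target > offsets[1])
		for (base = 2, limit = slots - base; limit != 0; limit >>= 1) {
			indx = base + (limit >> 1);
			if (target < offsets[indx])
				continue;
			base = indx + 1;
			--limit;
		}
	return (indx);
}

int
main(void)
{
	/* Made-up cumulative record offsets; offsets[5] is the total size. */
	uint32_t offsets[] = { 0, 100, 220, 300, 450, 500 };
	uint32_t targets[] = { 50, 250, 320, 999 };
	size_t i;

	for (i = 0; i < sizeof(targets) / sizeof(targets[0]); ++i)
		printf("target %" PRIu32 " -> slot %" PRIu32 "\n",
		    targets[i], find_slot(targets[i], offsets, 5));
	return (0);
}

With these offsets a target of 250 lands on slot 2, while 320 lands on slot 4 rather than 3; the caller's saved-state rollback absorbs that overshoot.
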
diff --git a/ext/datasources/helium/Makefile.am b/ext/datasources/helium/Makefile.am
new file mode 100644
index 00000000000..b4e6e67e2cd
--- /dev/null
+++ b/ext/datasources/helium/Makefile.am
@@ -0,0 +1,11 @@
+AM_CPPFLAGS = -I$(top_builddir) \
+ -I$(top_srcdir)/src/include -I$(HELIUM_PATH)
+
+noinst_LTLIBRARIES = libwiredtiger_helium.la
+libwiredtiger_helium_la_SOURCES = helium.c
+libwiredtiger_helium_la_LIBADD = -L$(HELIUM_PATH) -lhe
+
+# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
+# as installation; it will only build static libraries. As far as I can tell,
+# the "approved" libtool way to turn them back on is by adding -rpath.
+libwiredtiger_helium_la_LDFLAGS = -avoid-version -module -rpath /nowhere
diff --git a/ext/test/memrata/README b/ext/datasources/helium/README
index ee338474566..e78ba58c71d 100644
--- a/ext/test/memrata/README
+++ b/ext/datasources/helium/README
@@ -1,15 +1,15 @@
-Memrata README.
+Helium README.
-The data structures are "KVS sources" which map to one or more physical
-devices; each KVS source supports any number of "WiredTiger sources",
-where a WiredTiger source will be an object similar to a Btree "file:"
-object. Each WiredTiger source supports any number of WiredTiger cursors.
+The data structures are "Helium sources" which map to one or more physical
+volumes; each Helium source supports any number of "WiredTiger sources",
+where a WiredTiger source is an object similar to a Btree "file:" object.
+Each WiredTiger source supports any number of WiredTiger cursors.
-Each KVS source is given a logical name when the Memrata device is loaded,
-and that logical name is subsequently used when a WiredTiger source is
-created. For example, a KVS source might be named "dev1", and correspond
-to /dev/sd0 and /dev/sd1; subsequent WT_SESSION.create calls would specify
-a URI like "table:dev1/my_table".
+Each Helium source is given a logical name when first referenced, and that
+logical name is subsequently used when a WiredTiger source is created. For
+example, the logical name for a Helium source might be "dev1", and it would
+map to the Helium volumes /dev/sd0 and /dev/sd1; subsequent WT_SESSION.create
+calls specify a URI like "table:dev1/my_table".
For each WiredTiger source, we create two namespaces on the underlying device,
a "cache" and a "primary".
@@ -51,23 +51,23 @@ When a next/prev is done:
move to the next/prev visible item in the primary
return the one closest to the starting position
-Note locks are not acquired for read operations, and no flushes are done for
-any of these operations.
+Locks are not acquired for read operations, and no flushes are done for any of
+these operations.
-We also create one additional namespace, the "txn" name space, which serves
-all of the WiredTiger and KVS sources. Whenever a transaction commits, we
-insert a commit record into the txn name space and flush the device. When a
-transaction rolls back, we insert an abort record into the txn name space,
-but don't flush the device.
+We also create one additional object, the transaction name space, which serves
+all of the WiredTiger and Helium objects in a WiredTiger connection. Whenever
+a transaction involving a Helium source commits, we insert a commit record into
+the transaction name space and flush the device. When a transaction rolls back,
+we insert an abort record into the txn name space, but don't flush the device.
The visibility check is slightly different than the rest of WiredTiger: we do
not reset anything when a transaction aborts, and so we have to check if the
transaction has been aborted as well as check the transaction ID for visibility.
-We create a "cleanup" thread for every KVS source. The job of this thread is
-to migrate rows from the cache into the primary. Any committed, globally
-visible change in the cache can be copied into the primary and removed from
-the cache:
+We create a "cleanup" thread for every underlying Helium source. The job of
+this thread is to migrate rows from the cache object into the primary. Any
+committed, globally visible change in the cache can be copied into the primary
+and removed from the cache:
set BaseTxnID to the oldest transaction ID
not yet visible to a running transaction
@@ -95,24 +95,26 @@ to the primary.
No lock is required when removing rows from the transaction store, once the
transaction ID is less than the BaseTxnID, it will never be read.
-Memrata recovery is almost identical to the cleanup thread, which migrates
-rows from the cache into the primary. For every cache/primary name space,
-we migrate every commit to the primary (by definition, at recovery time it
-must be globally visible), and discard everything (by defintion, at recovery
-time anything not committed has been aborted.
+Helium recovery is almost identical to the cleanup thread, which migrates rows
+from the cache into the primary. For every cache/primary pair, migrate every
+commit to the primary (by definition, at recovery time it must be globally
+visible), and discard everything else (by definition, at recovery time anything
+not committed has been aborted).
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Questions, problems, whatever:
-* This implementation is endian-specific, that is, a store created on a
-little-endian machine is not portable to a big-endian machine.
+* The implementation is endian-specific, that is, the WiredTiger metadata
+stored on the Helium device is not portable to a big-endian machine.
+Helium's metadata is portable between different endian machines, so this
+should probably be fixed.
* There's a problem with transactions in WiredTiger that span more than a
single data source. For example, consider a transaction that modifies
-both a KVS object and a Btree object. If we commit and push the KVS
+both a Helium object and a Btree object. If we commit and push the Helium
commit record to stable storage, and then crash before committing the Btree
change, the enclosing WiredTiger transaction will/should end up aborting,
-and there's no way for us to back out the change in KVS. I'm leaving
+and there's no way for us to back out the change in Helium. I'm leaving
this problem alone until WiredTiger fine-grained durability is complete,
we're going to need WiredTiger support for some kind of 2PC to solve this.
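
As a toy, runnable model of the cleaner pass the README describes (all names and data below are made up; the real logic lives in helium.c), committed changes older than BaseTxnID are copied into the primary and then dropped from the cache:

#include <stdint.h>
#include <stdio.h>

struct upd {
	const char *key;
	uint64_t txnid;		/* Transaction that made the update */
	char state;		/* 'C' committed (the only case modeled here) */
	int in_cache, in_primary;
};

static void
cleaner_pass(struct upd *cache, size_t n, uint64_t base_txnid)
{
	size_t i;

	/* Copy committed, globally visible updates into the primary. */
	for (i = 0; i < n; ++i)
		if (cache[i].in_cache && cache[i].state == 'C' &&
		    cache[i].txnid < base_txnid)
			cache[i].in_primary = 1;

	/* (The real cleaner flushes the primary before discarding.) */

	/* Migrated entries no longer need their cached copy. */
	for (i = 0; i < n; ++i)
		if (cache[i].in_primary)
			cache[i].in_cache = 0;
}

int
main(void)
{
	struct upd cache[] = {
		{ "k1", 10, 'C', 1, 0 },	/* Globally visible: migrates */
		{ "k2", 50, 'C', 1, 0 },	/* Too new: stays in the cache */
	};
	size_t i;

	/* BaseTxnID: oldest ID not yet visible to a running transaction. */
	cleaner_pass(cache, 2, 20);
	for (i = 0; i < 2; ++i)
		printf("%s: cache=%d primary=%d\n",
		    cache[i].key, cache[i].in_cache, cache[i].in_primary);
	return (0);
}
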
diff --git a/ext/test/memrata/memrata.c b/ext/datasources/helium/helium.c
index 442c078b5a2..1239c88befa 100644
--- a/ext/test/memrata/memrata.c
+++ b/ext/datasources/helium/helium.c
@@ -24,6 +24,7 @@
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
+#include <sys/select.h>
#include <ctype.h>
#include <errno.h>
@@ -33,56 +34,80 @@
#include <stdlib.h>
#include <string.h>
-#include <kvs.h>
+#include <he.h>
+
#include <wiredtiger.h>
#include <wiredtiger_ext.h>
+typedef struct he_env HE_ENV;
+typedef struct he_item HE_ITEM;
+typedef struct he_stats HE_STATS;
+
+static int verbose = 0; /* Verbose messages */
+
/*
- * Macros to output an error message and set or return an error.
- * Requires local variables:
- * int ret;
+ * Macros to output error and verbose messages, and set or return an error.
+ * Error macros require local "ret" variable.
*
* ESET: update an error value, handling more/less important errors.
- * ERET: output a message and return the error.
- * EMSG, EMSG_ERR:
- * output a message and set the local error value, optionally jump to the
- * err label.
+ * ERET: output a message, return the error.
+ * EMSG: output a message, set the local error value.
+ * EMSG_ERR:
+ * output a message, set the local error value, jump to the err label.
+ * VMSG: verbose message.
*/
#undef ESET
#define ESET(a) do { \
- int __ret; \
- if ((__ret = (a)) != 0 && \
- (__ret == WT_PANIC || \
- ret == 0 || ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND)) \
- ret = __ret; \
+ int __v; \
+ if ((__v = (a)) != 0) { \
+ /* \
+ * On error, check for a panic (it overrides all other \
+ * returns). Else, if there's no return value or the \
+ * return value is not strictly an error, override it \
+ * with the error. \
+ */ \
+ if (__v == WT_PANIC || \
+ ret == 0 || \
+ ret == WT_DUPLICATE_KEY || ret == WT_NOTFOUND) \
+ ret = __v; \
+ /* \
+ * If we're set to a Helium error at the end of the day,\
+ * switch to a generic WiredTiger error. \
+ */ \
+ if (ret < 0 && ret > -31800) \
+ ret = WT_ERROR; \
+ } \
} while (0)
#undef ERET
#define ERET(wtext, session, v, ...) do { \
(void) \
- wtext->err_printf(wtext, session, "memrata: " __VA_ARGS__); \
- return (v); \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
+ ESET(v); \
+ return (ret); \
} while (0)
#undef EMSG
#define EMSG(wtext, session, v, ...) do { \
(void) \
- wtext->err_printf(wtext, session, "memrata: " __VA_ARGS__); \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
ESET(v); \
} while (0)
#undef EMSG_ERR
#define EMSG_ERR(wtext, session, v, ...) do { \
(void) \
- wtext->err_printf(wtext, session, "memrata: " __VA_ARGS__); \
+ wtext->err_printf(wtext, session, "helium: " __VA_ARGS__); \
ESET(v); \
goto err; \
} while (0)
-
-/*
- * STRING_MATCH --
- * Return if a string matches a bytestring of a specified length.
- */
-#undef STRING_MATCH
-#define STRING_MATCH(str, bytes, len) \
- (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0')
+#undef VERBOSE_L1
+#define VERBOSE_L1 1
+#undef VERBOSE_L2
+#define VERBOSE_L2 2
+#undef VMSG
+#define VMSG(wtext, session, v, ...) do { \
+ if (verbose >= v) \
+ (void)wtext-> \
+ msg_printf(wtext, session, "helium: " __VA_ARGS__); \
+} while (0)
/*
* OVERWRITE_AND_FREE --
@@ -95,20 +120,29 @@
} while (0)
/*
- * Version each file, out of sheer raging paranoia.
+ * Version each object, out of sheer raging paranoia.
*/
-#define KVS_MAJOR 1 /* KVS major, minor version */
-#define KVS_MINOR 0
+#define WIREDTIGER_HELIUM_MAJOR 1 /* Major, minor version */
+#define WIREDTIGER_HELIUM_MINOR 0
/*
- * WiredTiger name space on the memrata device: all primary store objects are
- * named "WiredTiger.XXX", the cache store object is "WiredTiger.XXX.cache",
- * and the per-device transaction file is "WiredTiger.txn".
+ * WiredTiger name space on the Helium store: all objects are named with the
+ * WiredTiger prefix (we don't require the Helium store be exclusive to our
+ * files). Primary objects are named "WiredTiger.[name]", associated cache
+ * objects are "WiredTiger.[name].cache". The per-connection transaction
+ * object is "WiredTiger.WiredTigerTxn". When we first open a Helium volume,
+ * we open/close a file in order to apply flags for the first open of the
+ * volume; that file is "WiredTiger.WiredTigerInit".
*/
#define WT_NAME_PREFIX "WiredTiger."
-#define WT_NAME_TXN "WiredTiger.txn"
+#define WT_NAME_INIT "WiredTiger.WiredTigerInit"
+#define WT_NAME_TXN "WiredTiger.WiredTigerTxn"
#define WT_NAME_CACHE ".cache"
+/*
+ * WT_SOURCE --
+ * A WiredTiger source, supporting one or more cursors.
+ */
typedef struct __wt_source {
char *uri; /* Unique name */
@@ -120,68 +154,79 @@ typedef struct __wt_source {
uint64_t append_recno; /* Allocation record number */
- int config_recno; /* config "key_format=r" */
int config_bitfield; /* config "value_format=#t" */
+ int config_compress; /* config "helium_o_compress" */
+ int config_recno; /* config "key_format=r" */
/*
- * Each WiredTiger object has a "primary" namespace in a KVS store plus
- * a "cache" namespace, which has not-yet-resolved updates. There's a
- * dirty flag so we can ignore the cache until it's used.
+ * Each WiredTiger object has a "primary" namespace in a Helium store
+ * plus a "cache" namespace, which has not-yet-resolved updates. There
+ * is a dirty flag so read-only data sets can ignore the cache.
*/
- kvs_t kvs; /* Underlying KVS object */
- kvs_t kvscache; /* Underlying KVS cache */
- int kvscache_inuse;
+ he_t he; /* Underlying Helium object */
+ he_t he_cache; /* Underlying Helium cache */
+ int he_cache_inuse;
- uint64_t cleaner_bytes; /* Bytes since clean */
- uint64_t cleaner_ops; /* Operations since clean */
-
- struct __kvs_source *ks; /* Underlying KVS source */
+ struct __he_source *hs; /* Underlying Helium source */
struct __wt_source *next; /* List of WiredTiger objects */
} WT_SOURCE;
-typedef struct __kvs_source {
+/*
+ * HELIUM_SOURCE --
+ * A Helium volume, supporting one or more WT_SOURCE objects.
+ */
+typedef struct __he_source {
/*
* XXX
* The transaction commit handler must appear first in the structure.
*/
WT_TXN_NOTIFY txn_notify; /* Transaction commit handler */
- char *name; /* Unique name */
+ WT_EXTENSION_API *wtext; /* Extension functions */
+
+ char *name; /* Unique WiredTiger name */
+ char *device; /* Unique Helium volume name */
- kvs_t kvs_device; /* Underlying KVS store */
+ /*
+ * Maintain a handle for each underlying Helium source so checkpoint is
+ * faster: we can "commit" a single handle per source, regardless of the
+ * number of objects.
+ */
+ he_t he_volume;
struct __wt_source *ws_head; /* List of WiredTiger sources */
/*
- * Each KVS source has a cleaner thread to migrate WiredTiger source
+ * Each Helium source has a cleaner thread to migrate WiredTiger source
* updates from the cache namespace to the primary namespace, based on
- * the number of bytes or the number of operations. We read these
- * fields without a lock, but serialize writes to minimize races (and
- * because it costs us nothing).
- *
- * There's a cleaner thread per KVS store because migration operations
- * can overlap.
+ * the number of bytes or the number of operations. (There's a cleaner
+ * thread per Helium store so migration operations can overlap.) We
+ * read these fields without a lock, but serialize writes to minimize
+ * races (and because it costs us nothing).
*/
- WT_EXTENSION_API *wtext; /* Extension functions */
pthread_t cleaner_id; /* Cleaner thread ID */
volatile int cleaner_stop; /* Cleaner thread quit flag */
/*
* Each WiredTiger connection has a transaction namespace which lists
* resolved transactions with their committed or aborted state as a
- * value. We create that namespace in the first KVS store created,
- * and then simply reference it from other, subsequently created KVS
- * stores.
+ * value. That namespace appears in a single Helium store (the first
+ * one created, if it doesn't already exist), and then it's referenced
+ * from other Helium stores.
*/
#define TXN_ABORTED 'A'
#define TXN_COMMITTED 'C'
#define TXN_UNRESOLVED 0
- kvs_t kvstxn; /* Underlying KVS txn store */
- int kvsowner; /* Owns transaction store */
+ he_t he_txn; /* Helium txn store */
+ int he_owner; /* Owns transaction store */
- struct __kvs_source *next; /* List of KVS sources */
-} KVS_SOURCE;
+ struct __he_source *next; /* List of Helium sources */
+} HELIUM_SOURCE;
+/*
+ * DATA_SOURCE --
+ * A WiredTiger data source, supporting one or more HELIUM_SOURCE objects.
+ */
typedef struct __data_source {
WT_DATA_SOURCE wtds; /* Must come first */
@@ -190,10 +235,13 @@ typedef struct __data_source {
pthread_rwlock_t global_lock; /* Global lock */
int lockinit; /* Lock created */
- KVS_SOURCE *kvs_head; /* List of KVS sources */
+ struct __he_source *hs_head; /* List of Helium sources */
} DATA_SOURCE;
/*
+ * CACHE_RECORD --
+ * An array of updates from the cache object.
+ *
* Values in the cache store are marshalled/unmarshalled to/from the store,
* using a simple encoding:
* {N records: 4B}
@@ -203,7 +251,7 @@ typedef struct __data_source {
* {record#1 data}
* ...
*
- * Each KVS cursor potentially has a single set of these values.
+ * Each cursor potentially has a single set of these values.
*/
typedef struct __cache_record {
uint8_t *v; /* Value */
@@ -213,15 +261,19 @@ typedef struct __cache_record {
int remove; /* 1/0 remove flag */
} CACHE_RECORD;
+/*
+ * CURSOR --
+ * A cursor, supporting a single WiredTiger cursor.
+ */
typedef struct __cursor {
WT_CURSOR wtcursor; /* Must come first */
WT_EXTENSION_API *wtext; /* Extension functions */
- WT_SOURCE *ws; /* WiredTiger source */
+ WT_SOURCE *ws; /* Underlying source */
- struct kvs_record record; /* Record */
- uint8_t __key[KVS_MAX_KEY_LEN]; /* Record.key, Record.value */
+ HE_ITEM record; /* Record */
+ uint8_t __key[HE_MAX_KEY_LEN]; /* Record.key, Record.value */
uint8_t *v;
size_t len;
size_t mem_len;
@@ -241,6 +293,26 @@ typedef struct __cursor {
} CURSOR;
/*
+ * prefix_match --
+ * Return if a string matches a prefix.
+ */
+static inline int
+prefix_match(const char *str, const char *pfx)
+{
+ return (strncmp(str, pfx, strlen(pfx)) == 0);
+}
+
+/*
+ * string_match --
+ * Return if a string matches a byte string of len bytes.
+ */
+static inline int
+string_match(const char *str, const char *bytes, size_t len)
+{
+ return (strncmp(str, bytes, len) == 0 && (str)[(len)] == '\0');
+}
+
+/*
* cursor_destroy --
* Free a cursor's memory, and optionally the cursor itself.
*/
@@ -259,7 +331,7 @@ cursor_destroy(CURSOR *cursor)
/*
* os_errno --
- * Limit our use of errno so it's easy to remove.
+ * Limit our use of errno so it's easy to find/remove.
*/
static int
os_errno(void)
@@ -272,8 +344,7 @@ os_errno(void)
* Initialize a lock.
*/
static int
-lock_init(
- WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+lock_init(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
{
int ret = 0;
@@ -304,8 +375,7 @@ lock_destroy(
* Acquire a write lock.
*/
static inline int
-writelock(
- WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
+writelock(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
{
int ret = 0;
@@ -331,71 +401,90 @@ unlock(WT_EXTENSION_API *wtext, WT_SESSION *session, pthread_rwlock_t *lockp)
}
#if 0
-static int
-kvs_dump_print(uint8_t *p, size_t len, FILE *fp)
+static void
+helium_dump_kv(const char *pfx, uint8_t *p, size_t len, FILE *fp)
{
+ (void)fprintf(stderr, "%s %3zu: ", pfx, len);
for (; len > 0; --len, ++p)
if (!isspace(*p) && isprint(*p))
- putc(*p, fp);
+ (void)putc(*p, fp);
else if (len == 1 && *p == '\0') /* Skip string nuls. */
continue;
else
- fprintf(fp, "%#x", *p);
+ (void)fprintf(fp, "%#x", *p);
+ (void)putc('\n', fp);
}
/*
- * kvs_dump --
- * Dump the records in a KVS store.
+ * he_dump --
+ * Dump the records in a Helium store.
*/
static int
-kvs_dump(kvs_t kvs, const char *tag)
+helium_dump(WT_EXTENSION_API *wtext, he_t he, const char *tag)
{
- FILE *fp;
- struct kvs_record *r, _r;
- size_t maxbuf = 4 * 1024;
+ HE_ITEM *r, _r;
+ uint8_t k[4 * 1024], v[4 * 1024];
int ret = 0;
r = &_r;
memset(r, 0, sizeof(*r));
- r->key = malloc(maxbuf);
- r->key_len = 0;
- r->val = malloc(maxbuf);
- r->val_len = maxbuf;
-
- (void)snprintf(r->val, maxbuf, "dump.%s", tag);
- fp = fopen(r->val, "w");
- fprintf(fp, "== %s\n", tag);
-
- while ((ret = kvs_next(kvs, r, 0UL, (unsigned long)maxbuf)) == 0) {
- kvs_dump_print(r->key, r->key_len, fp);
- putc('\t', fp);
- kvs_dump_print(r->val, r->val_len, fp);
- putc('\n', fp);
+ r->key = k;
+ r->val = v;
- r->val_len = maxbuf;
+ (void)fprintf(stderr, "== %s\n", tag);
+ while ((ret = he_next(he, r, (size_t)0, sizeof(v))) == 0) {
+#if 0
+ uint64_t recno;
+ if ((ret = wtext->struct_unpack(wtext,
+ NULL, r->key, r->key_len, "r", &recno)) != 0)
+ return (ret);
+ fprintf(stderr, "K: %" PRIu64, recno);
+#else
+ helium_dump_kv("K: ", r->key, r->key_len, stderr);
+#endif
+ helium_dump_kv("V: ", r->val, r->val_len, stderr);
}
- if (ret == KVS_E_KEY_NOT_FOUND)
- ret = 0;
- fprintf(fp, "========================== (%d)\n", ret);
- fclose(fp);
+ if (ret != HE_ERR_ITEM_NOT_FOUND) {
+ fprintf(stderr, "he_next: %s\n", he_strerror(ret));
+ ret = WT_ERROR;
+ }
+ return (ret);
+}
- free(r->key);
- free(r->val);
+/*
+ * helium_stats --
+ * Display Helium statistics for a datastore.
+ */
+static int
+helium_stats(
+ WT_EXTENSION_API *wtext, WT_SESSION *session, he_t he, const char *tag)
+{
+ HE_STATS stats;
+ int ret = 0;
- return (ret);
+ if ((ret = he_stats(he, &stats)) != 0)
+ ERET(wtext, session, ret, "he_stats: %s", he_strerror(ret));
+ fprintf(stderr, "== %s\n", tag);
+ fprintf(stderr, "name=%s\n", stats.name);
+ fprintf(stderr, "deleted_items=%" PRIu64 "\n", stats.deleted_items);
+ fprintf(stderr, "locked_items=%" PRIu64 "\n", stats.locked_items);
+ fprintf(stderr, "valid_items=%" PRIu64 "\n", stats.valid_items);
+ fprintf(stderr, "capacity=%" PRIu64 "B\n", stats.capacity);
+ fprintf(stderr, "size=%" PRIu64 "B\n", stats.size);
+ return (0);
}
#endif
/*
- * kvs_call --
- * Call a KVS key retrieval function, handling overflow.
+ * helium_call --
+ * Call a Helium key retrieval function, handling overflow.
*/
static inline int
-kvs_call(WT_CURSOR *wtcursor, const char *fname, kvs_t kvs,
- int (*f)(kvs_t, struct kvs_record *, unsigned long, unsigned long))
+helium_call(WT_CURSOR *wtcursor, const char *fname,
+ he_t he, int (*f)(he_t, HE_ITEM *, size_t, size_t))
{
- struct kvs_record *r;
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
int ret = 0;
@@ -409,18 +498,17 @@ kvs_call(WT_CURSOR *wtcursor, const char *fname, kvs_t kvs,
r->val = cursor->v;
restart:
- if ((ret = f(kvs, r, 0UL, (unsigned long)cursor->mem_len)) != 0) {
- if (ret == KVS_E_KEY_NOT_FOUND)
+ if ((ret = f(he, r, (size_t)0, cursor->mem_len)) != 0) {
+ if (ret == HE_ERR_ITEM_NOT_FOUND)
return (WT_NOTFOUND);
- ERET(wtext,
- session, WT_ERROR, "%s: %s", fname, kvs_strerror(ret));
+ ERET(wtext, session, ret, "%s: %s", fname, he_strerror(ret));
}
/*
* If the returned length is larger than our passed-in length, we didn't
- * get the complete value. Grow the buffer and use kvs_get to complete
- * the retrieval (kvs_get because the call succeeded and the key was
- * copied out, so calling kvs_next/kvs_prev again would skip key/value
+ * get the complete value. Grow the buffer and use he_lookup to do the
+ * retrieval (he_lookup because the call succeeded and the key was
+ * copied out, so calling he_next/he_prev again would skip key/value
* pairs).
*
* We have to loop, another thread of control might change the length of
@@ -441,12 +529,11 @@ restart:
cursor->v = r->val = p;
cursor->mem_len = r->val_len + 32;
- if ((ret = kvs_get(
- kvs, r, 0UL, (unsigned long)cursor->mem_len)) != 0) {
- if (ret == KVS_E_KEY_NOT_FOUND)
+ if ((ret = he_lookup(he, r, (size_t)0, cursor->mem_len)) != 0) {
+ if (ret == HE_ERR_ITEM_NOT_FOUND)
goto restart;
- ERET(wtext, session,
- WT_ERROR, "kvs_get: %s", kvs_strerror(ret));
+ ERET(wtext,
+ session, ret, "he_lookup: %s", he_strerror(ret));
}
}
/* NOTREACHED */
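
The helium_call change above keeps the same grow-and-retry shape as the old kvs_call: if the store reports a value longer than the buffer we passed in, grow the buffer and look the key up again, looping because another thread may keep changing the value (the retry uses he_lookup so a next/prev call isn't repeated). A standalone sketch of the pattern with a fake lookup function:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char fake_value[] =
    "a value rather larger than the first buffer we try";

/* Copy up to buf_len bytes and report the full length, like he_lookup. */
static int
fake_lookup(char *buf, size_t buf_len, size_t *value_lenp)
{
	*value_lenp = sizeof(fake_value);
	memcpy(buf, fake_value,
	    buf_len < sizeof(fake_value) ? buf_len : sizeof(fake_value));
	return (0);
}

int
main(void)
{
	char *buf, *p;
	size_t buf_len, value_len;

	buf_len = 8;				/* Deliberately too small */
	if ((buf = malloc(buf_len)) == NULL)
		return (EXIT_FAILURE);

	for (;;) {
		if (fake_lookup(buf, buf_len, &value_len) != 0)
			return (EXIT_FAILURE);
		if (value_len <= buf_len)	/* Got the complete value */
			break;
		/* Grow the buffer (with some slop) and retry the lookup. */
		if ((p = realloc(buf, value_len + 32)) == NULL)
			return (EXIT_FAILURE);
		buf = p;
		buf_len = value_len + 32;
	}
	printf("retrieved %zu bytes: %s\n", value_len, buf);
	free(buf);
	return (EXIT_SUCCESS);
}
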
@@ -458,47 +545,46 @@ restart:
*/
static int
txn_state_set(WT_EXTENSION_API *wtext,
- WT_SESSION *session, KVS_SOURCE *ks, uint64_t txnid, int commit)
+ WT_SESSION *session, HELIUM_SOURCE *hs, uint64_t txnid, int commit)
{
- struct kvs_record txn;
+ HE_ITEM txn;
uint8_t val;
int ret = 0;
- /* Update the store -- commits must be durable, flush the device. */
- memset(&txn, 0, sizeof(txn));
- txn.key = &txnid;
- txn.key_len = sizeof(txnid);
-
/*
+ * Update the store -- commits must be durable, flush the volume.
+ *
+ * XXX
* Not endian-portable, we're writing a native transaction ID to the
* store.
*/
+ memset(&txn, 0, sizeof(txn));
+ txn.key = &txnid;
+ txn.key_len = sizeof(txnid);
val = commit ? TXN_COMMITTED : TXN_ABORTED;
txn.val = &val;
- txn.val_len = 1;
+ txn.val_len = sizeof(val);
- if ((ret = kvs_set(ks->kvstxn, &txn)) != 0)
- ERET(wtext, session,
- WT_ERROR, "kvs_set: %s", kvs_strerror(ret));
+ if ((ret = he_update(hs->he_txn, &txn)) != 0)
+ ERET(wtext, session, ret, "he_update: %s", he_strerror(ret));
- if (commit && (ret = kvs_commit(ks->kvs_device)) != 0)
- ERET(wtext, session,
- WT_ERROR, "kvs_commit: %s", kvs_strerror(ret));
+ if (commit && (ret = he_commit(hs->he_txn)) != 0)
+ ERET(wtext, session, ret, "he_commit: %s", he_strerror(ret));
return (0);
}
/*
* txn_notify --
- * Resolve a transaction.
+ * Resolve a transaction; called from WiredTiger during commit/abort.
*/
static int
txn_notify(WT_TXN_NOTIFY *handler,
WT_SESSION *session, uint64_t txnid, int committed)
{
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
- ks = (KVS_SOURCE *)handler;
- return (txn_state_set(ks->wtext, session, ks, txnid, committed));
+ hs = (HELIUM_SOURCE *)handler;
+ return (txn_state_set(hs->wtext, session, hs, txnid, committed));
}
/*
@@ -508,13 +594,13 @@ txn_notify(WT_TXN_NOTIFY *handler,
static int
txn_state(WT_CURSOR *wtcursor, uint64_t txnid)
{
- struct kvs_record txn;
CURSOR *cursor;
- KVS_SOURCE *ks;
+ HE_ITEM txn;
+ HELIUM_SOURCE *hs;
uint8_t val_buf[16];
cursor = (CURSOR *)wtcursor;
- ks = cursor->ws->ks;
+ hs = cursor->ws->hs;
memset(&txn, 0, sizeof(txn));
txn.key = &txnid;
@@ -522,7 +608,7 @@ txn_state(WT_CURSOR *wtcursor, uint64_t txnid)
txn.val = val_buf;
txn.val_len = sizeof(val_buf);
- if (kvs_get(ks->kvstxn, &txn, 0UL, (unsigned long)sizeof(val_buf)) == 0)
+ if (he_lookup(hs->he_txn, &txn, (size_t)0, sizeof(val_buf)) == 0)
return (val_buf[0]);
return (TXN_UNRESOLVED);
}
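
A toy model of the transaction-state lookup above: the transaction namespace maps a native transaction ID to a single byte, TXN_COMMITTED or TXN_ABORTED, and a missing entry means the transaction is unresolved. The in-memory store below is made up; helium.c keeps this state in a Helium object.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define TXN_ABORTED	'A'
#define TXN_COMMITTED	'C'
#define TXN_UNRESOLVED	0

struct txn_entry {
	uint64_t txnid;
	char state;
};

/* Linear search stands in for the he_lookup call in txn_state above. */
static char
txn_state(const struct txn_entry *store, size_t n, uint64_t txnid)
{
	size_t i;

	for (i = 0; i < n; ++i)
		if (store[i].txnid == txnid)
			return (store[i].state);
	return (TXN_UNRESOLVED);
}

int
main(void)
{
	struct txn_entry store[] = {
		{ 100, TXN_COMMITTED }, { 101, TXN_ABORTED },
	};
	uint64_t ids[] = { 100, 101, 102 };
	char state;
	size_t i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); ++i) {
		state = txn_state(store, 2, ids[i]);
		printf("txn %" PRIu64 ": %s\n", ids[i],
		    state == TXN_COMMITTED ? "committed" :
		    state == TXN_ABORTED ? "aborted" : "unresolved");
	}
	return (0);
}
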
@@ -534,8 +620,8 @@ txn_state(WT_CURSOR *wtcursor, uint64_t txnid)
static int
cache_value_append(WT_CURSOR *wtcursor, int remove_op)
{
- struct kvs_record *r;
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
uint64_t txnid;
@@ -587,6 +673,7 @@ cache_value_append(WT_CURSOR *wtcursor, int remove_op)
* Copy the WiredTiger cursor's data into place: txn ID, remove
* tombstone, data length, data.
*
+ * XXX
* Not endian-portable, we're writing a native transaction ID to the
* store.
*/
@@ -604,7 +691,7 @@ cache_value_append(WT_CURSOR *wtcursor, int remove_op)
}
cursor->len = (size_t)(p - cursor->v);
- /* Update the underlying KVS record. */
+ /* Update the underlying Helium record. */
r->val = cursor->v;
r->val_len = cursor->len;
@@ -764,10 +851,8 @@ cache_value_visible_all(WT_CURSOR *wtcursor, uint64_t oldest)
{
CACHE_RECORD *cp;
CURSOR *cursor;
- WT_SESSION *session;
u_int i;
- session = wtcursor->session;
cursor = (CURSOR *)wtcursor;
/*
@@ -882,21 +967,23 @@ cache_value_txnmin(WT_CURSOR *wtcursor, uint64_t *txnminp)
static int
key_max_err(WT_EXTENSION_API *wtext, WT_SESSION *session, size_t len)
{
+ int ret = 0;
+
ERET(wtext, session, EINVAL,
- "key length (%" PRIuMAX " bytes) larger than the maximum Memrata "
+ "key length (%zu bytes) larger than the maximum Helium "
"key length of %d bytes",
- (uintmax_t)len, KVS_MAX_KEY_LEN);
+ len, HE_MAX_KEY_LEN);
}
/*
* copyin_key --
- * Copy a WT_CURSOR key to a struct kvs_record key.
+ * Copy a WT_CURSOR key to a HE_ITEM key.
*/
static inline int
copyin_key(WT_CURSOR *wtcursor, int allocate_key)
{
- struct kvs_record *r;
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
WT_SOURCE *ws;
@@ -944,14 +1031,14 @@ copyin_key(WT_CURSOR *wtcursor, int allocate_key)
if ((ret = wtext->struct_size(wtext, session,
&size, "r", wtcursor->recno)) != 0 ||
(ret = wtext->struct_pack(wtext, session,
- r->key, KVS_MAX_KEY_LEN, "r", wtcursor->recno)) != 0)
+ r->key, HE_MAX_KEY_LEN, "r", wtcursor->recno)) != 0)
return (ret);
r->key_len = size;
} else {
/* I'm not sure this test is necessary, but it's cheap. */
- if (wtcursor->key.size > KVS_MAX_KEY_LEN)
- return (key_max_err(
- wtext, session, (size_t)wtcursor->key.size));
+ if (wtcursor->key.size > HE_MAX_KEY_LEN)
+ return (
+ key_max_err(wtext, session, wtcursor->key.size));
/*
* A set cursor key might reference application memory, which
@@ -969,13 +1056,13 @@ copyin_key(WT_CURSOR *wtcursor, int allocate_key)
/*
* copyout_key --
- * Copy a struct kvs_record key to a WT_CURSOR key.
+ * Copy a HE_ITEM key to a WT_CURSOR key.
*/
static inline int
copyout_key(WT_CURSOR *wtcursor)
{
- struct kvs_record *r;
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
WT_SOURCE *ws;
@@ -993,14 +1080,14 @@ copyout_key(WT_CURSOR *wtcursor)
return (ret);
} else {
wtcursor->key.data = r->key;
- wtcursor->key.size = (uint32_t)r->key_len;
+ wtcursor->key.size = (size_t)r->key_len;
}
return (0);
}
/*
* copyout_val --
- * Copy a kvs store's struct kvs_record value to a WT_CURSOR value.
+ * Copy a Helium store's HE_ITEM value to a WT_CURSOR value.
*/
static inline int
copyout_val(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
@@ -1011,7 +1098,7 @@ copyout_val(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
if (cp == NULL) {
wtcursor->value.data = cursor->v;
- wtcursor->value.size = (uint32_t)cursor->len;
+ wtcursor->value.size = cursor->len;
} else {
wtcursor->value.data = cp->v;
wtcursor->value.size = cp->len;
@@ -1025,11 +1112,11 @@ copyout_val(WT_CURSOR *wtcursor, CACHE_RECORD *cp)
*/
static int
nextprev(WT_CURSOR *wtcursor, const char *fname,
- int (*f)(kvs_t, struct kvs_record *, unsigned long, unsigned long))
+ int (*f)(he_t, HE_ITEM *, size_t, size_t))
{
- struct kvs_record *r;
CACHE_RECORD *cp;
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
WT_ITEM a, b;
WT_SESSION *session;
@@ -1050,7 +1137,7 @@ nextprev(WT_CURSOR *wtcursor, const char *fname,
* the store. We don't care if we race, we're not guaranteeing any
* special behavior with respect to phantoms.
*/
- if (ws->kvscache_inuse == 0) {
+ if (ws->he_cache_inuse == 0) {
cache_ret = WT_NOTFOUND;
goto cache_clean;
}
@@ -1079,7 +1166,7 @@ skip_deleted:
* entry, or we reach the end/beginning.
*/
for (cache_rm = 0;;) {
- if ((ret = kvs_call(wtcursor, fname, ws->kvscache, f)) != 0)
+ if ((ret = helium_call(wtcursor, fname, ws->he_cache, f)) != 0)
break;
if ((ret = cache_value_unmarshall(wtcursor)) != 0)
return (ret);
@@ -1134,7 +1221,7 @@ skip_deleted:
cache_clean:
/* Get the next/prev entry from the store. */
- ret = kvs_call(wtcursor, fname, ws->kvs, f);
+ ret = helium_call(wtcursor, fname, ws->he, f);
if (ret != 0 && ret != WT_NOTFOUND)
return (ret);
@@ -1154,7 +1241,7 @@ cache_clean:
if ((ret = wtext->collate(wtext, session, &a, &b, &cmp)) != 0)
return (ret);
- if (f == kvs_next) {
+ if (f == he_next) {
if (cmp >= 0)
ret = WT_NOTFOUND;
else
@@ -1196,34 +1283,34 @@ cache_clean:
}
/*
- * kvs_cursor_next --
+ * helium_cursor_next --
* WT_CURSOR.next method.
*/
static int
-kvs_cursor_next(WT_CURSOR *wtcursor)
+helium_cursor_next(WT_CURSOR *wtcursor)
{
- return (nextprev(wtcursor, "kvs_next", kvs_next));
+ return (nextprev(wtcursor, "he_next", he_next));
}
/*
- * kvs_cursor_prev --
+ * helium_cursor_prev --
* WT_CURSOR.prev method.
*/
static int
-kvs_cursor_prev(WT_CURSOR *wtcursor)
+helium_cursor_prev(WT_CURSOR *wtcursor)
{
- return (nextprev(wtcursor, "kvs_prev", kvs_prev));
+ return (nextprev(wtcursor, "he_prev", he_prev));
}
/*
- * kvs_cursor_reset --
+ * helium_cursor_reset --
* WT_CURSOR.reset method.
*/
static int
-kvs_cursor_reset(WT_CURSOR *wtcursor)
+helium_cursor_reset(WT_CURSOR *wtcursor)
{
- struct kvs_record *r;
CURSOR *cursor;
+ HE_ITEM *r;
cursor = (CURSOR *)wtcursor;
r = &cursor->record;
@@ -1237,11 +1324,11 @@ kvs_cursor_reset(WT_CURSOR *wtcursor)
}
/*
- * kvs_cursor_search --
+ * helium_cursor_search --
* WT_CURSOR.search method.
*/
static int
-kvs_cursor_search(WT_CURSOR *wtcursor)
+helium_cursor_search(WT_CURSOR *wtcursor)
{
CACHE_RECORD *cp;
CURSOR *cursor;
@@ -1259,7 +1346,8 @@ kvs_cursor_search(WT_CURSOR *wtcursor)
* Check for an entry in the cache. If we find one, unmarshall it
* and check for a visible entry we can return.
*/
- if ((ret = kvs_call(wtcursor, "kvs_get", ws->kvscache, kvs_get)) == 0) {
+ if ((ret =
+ helium_call(wtcursor, "he_lookup", ws->he_cache, he_lookup)) == 0) {
if ((ret = cache_value_unmarshall(wtcursor)) != 0)
return (ret);
if (cache_value_visible(wtcursor, &cp))
@@ -1269,18 +1357,18 @@ kvs_cursor_search(WT_CURSOR *wtcursor)
return (ret);
/* Check for an entry in the primary store. */
- if ((ret = kvs_call(wtcursor, "kvs_get", ws->kvs, kvs_get)) != 0)
+ if ((ret = helium_call(wtcursor, "he_lookup", ws->he, he_lookup)) != 0)
return (ret);
return (copyout_val(wtcursor, NULL));
}
/*
- * kvs_cursor_search_near --
+ * helium_cursor_search_near --
* WT_CURSOR.search_near method.
*/
static int
-kvs_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
+helium_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
{
int ret = 0;
@@ -1294,7 +1382,7 @@ kvs_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
*/
/* Search for an exact match. */
- if ((ret = kvs_cursor_search(wtcursor)) == 0) {
+ if ((ret = helium_cursor_search(wtcursor)) == 0) {
*exact = 0;
return (0);
}
@@ -1302,7 +1390,7 @@ kvs_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
return (ret);
/* Search for a key that's larger. */
- if ((ret = kvs_cursor_next(wtcursor)) == 0) {
+ if ((ret = helium_cursor_next(wtcursor)) == 0) {
*exact = 1;
return (0);
}
@@ -1310,7 +1398,7 @@ kvs_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
return (ret);
/* Search for a key that's smaller. */
- if ((ret = kvs_cursor_prev(wtcursor)) == 0) {
+ if ((ret = helium_cursor_prev(wtcursor)) == 0) {
*exact = -1;
return (0);
}
@@ -1319,16 +1407,16 @@ kvs_cursor_search_near(WT_CURSOR *wtcursor, int *exact)
}
/*
- * kvs_cursor_insert --
+ * helium_cursor_insert --
* WT_CURSOR.insert method.
*/
static int
-kvs_cursor_insert(WT_CURSOR *wtcursor)
+helium_cursor_insert(WT_CURSOR *wtcursor)
{
- struct kvs_record *r;
CACHE_RECORD *cp;
CURSOR *cursor;
- KVS_SOURCE *ks;
+ HE_ITEM *r;
+ HELIUM_SOURCE *hs;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
WT_SOURCE *ws;
@@ -1338,13 +1426,16 @@ kvs_cursor_insert(WT_CURSOR *wtcursor)
cursor = (CURSOR *)wtcursor;
wtext = cursor->wtext;
ws = cursor->ws;
- ks = ws->ks;
+ hs = ws->hs;
r = &cursor->record;
/* Get the WiredTiger cursor's key. */
if ((ret = copyin_key(wtcursor, 1)) != 0)
return (ret);
+ VMSG(wtext, session, VERBOSE_L2,
+ "I %.*s.%.*s", (int)r->key_len, r->key, (int)r->val_len, r->val);
+
/* Clear the value, assume we're adding the first cache entry. */
cursor->len = 0;
@@ -1353,7 +1444,8 @@ kvs_cursor_insert(WT_CURSOR *wtcursor)
return (ret);
/* Read the record from the cache store. */
- switch (ret = kvs_call(wtcursor, "kvs_get", ws->kvscache, kvs_get)) {
+ switch (ret = helium_call(
+ wtcursor, "he_lookup", ws->he_cache, he_lookup)) {
case 0:
/* Crack the record. */
if ((ret = cache_value_unmarshall(wtcursor)) != 0)
@@ -1385,8 +1477,8 @@ kvs_cursor_insert(WT_CURSOR *wtcursor)
break;
/* If overwrite is false, an entry is an error. */
- if ((ret = kvs_call(
- wtcursor, "kvs_get", ws->kvs, kvs_get)) != WT_NOTFOUND) {
+ if ((ret = helium_call(
+ wtcursor, "he_lookup", ws->he, he_lookup)) != WT_NOTFOUND) {
if (ret == 0)
ret = WT_DUPLICATE_KEY;
goto err;
@@ -1398,21 +1490,17 @@ kvs_cursor_insert(WT_CURSOR *wtcursor)
}
/*
- * Create a new cache value based on the current cache record plus the
- * WiredTiger cursor's value.
+ * Create a new value using the current cache record plus the WiredTiger
+ * cursor's value, and update the cache.
*/
if ((ret = cache_value_append(wtcursor, 0)) != 0)
goto err;
-
- /* Push the record into the cache. */
- if ((ret = kvs_set(ws->kvscache, r)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_set: %s", kvs_strerror(ret));
+ if ((ret = he_update(ws->he_cache, r)) != 0)
+ EMSG(wtext, session, ret, "he_update: %s", he_strerror(ret));
/* Update the state while still holding the lock. */
- ws->kvscache_inuse = 1;
- ws->cleaner_bytes += wtcursor->value.size;
- ++ws->cleaner_ops;
+ if (ws->he_cache_inuse == 0)
+ ws->he_cache_inuse = 1;
/* Discard the lock. */
err: ESET(unlock(wtext, session, &ws->lock));
@@ -1420,7 +1508,7 @@ err: ESET(unlock(wtext, session, &ws->lock));
/* If successful, request notification at transaction resolution. */
if (ret == 0)
ESET(
- wtext->transaction_notify(wtext, session, &ks->txn_notify));
+ wtext->transaction_notify(wtext, session, &hs->txn_notify));
return (ret);
}
@@ -1432,10 +1520,10 @@ err: ESET(unlock(wtext, session, &ws->lock));
static int
update(WT_CURSOR *wtcursor, int remove_op)
{
- struct kvs_record *r;
CACHE_RECORD *cp;
CURSOR *cursor;
- KVS_SOURCE *ks;
+ HE_ITEM *r;
+ HELIUM_SOURCE *hs;
WT_EXTENSION_API *wtext;
WT_SESSION *session;
WT_SOURCE *ws;
@@ -1445,13 +1533,18 @@ update(WT_CURSOR *wtcursor, int remove_op)
cursor = (CURSOR *)wtcursor;
wtext = cursor->wtext;
ws = cursor->ws;
- ks = ws->ks;
+ hs = ws->hs;
r = &cursor->record;
/* Get the WiredTiger cursor's key. */
if ((ret = copyin_key(wtcursor, 0)) != 0)
return (ret);
+ VMSG(wtext, session, VERBOSE_L2,
+ "%c %.*s.%.*s",
+ remove_op ? 'R' : 'U',
+ (int)r->key_len, r->key, (int)r->val_len, r->val);
+
/* Clear the value, assume we're adding the first cache entry. */
cursor->len = 0;
@@ -1460,7 +1553,8 @@ update(WT_CURSOR *wtcursor, int remove_op)
return (ret);
/* Read the record from the cache store. */
- switch (ret = kvs_call(wtcursor, "kvs_get", ws->kvscache, kvs_get)) {
+ switch (ret = helium_call(
+ wtcursor, "he_lookup", ws->he_cache, he_lookup)) {
case 0:
/* Crack the record. */
if ((ret = cache_value_unmarshall(wtcursor)) != 0)
@@ -1491,8 +1585,8 @@ update(WT_CURSOR *wtcursor, int remove_op)
break;
/* If overwrite is false, no entry is an error. */
- if ((ret = kvs_call(
- wtcursor, "kvs_get", ws->kvs, kvs_get)) != 0)
+ if ((ret =
+ helium_call(wtcursor, "he_lookup", ws->he, he_lookup)) != 0)
goto err;
/*
@@ -1513,10 +1607,12 @@ update(WT_CURSOR *wtcursor, int remove_op)
goto err;
/* Push the record into the cache. */
- if ((ret = kvs_set(ws->kvscache, r)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_set: %s", kvs_strerror(ret));
- ws->kvscache_inuse = 1;
+ if ((ret = he_update(ws->he_cache, r)) != 0)
+ EMSG(wtext, session, ret, "he_update: %s", he_strerror(ret));
+
+ /* Update the state while still holding the lock. */
+ if (ws->he_cache_inuse == 0)
+ ws->he_cache_inuse = 1;
/* Discard the lock. */
err: ESET(unlock(wtext, session, &ws->lock));
@@ -1524,27 +1620,27 @@ err: ESET(unlock(wtext, session, &ws->lock));
/* If successful, request notification at transaction resolution. */
if (ret == 0)
ESET(
- wtext->transaction_notify(wtext, session, &ks->txn_notify));
+ wtext->transaction_notify(wtext, session, &hs->txn_notify));
return (ret);
}
/*
- * kvs_cursor_update --
+ * helium_cursor_update --
* WT_CURSOR.update method.
*/
static int
-kvs_cursor_update(WT_CURSOR *wtcursor)
+helium_cursor_update(WT_CURSOR *wtcursor)
{
return (update(wtcursor, 0));
}
/*
- * kvs_cursor_remove --
+ * helium_cursor_remove --
* WT_CURSOR.remove method.
*/
static int
-kvs_cursor_remove(WT_CURSOR *wtcursor)
+helium_cursor_remove(WT_CURSOR *wtcursor)
{
CURSOR *cursor;
WT_SOURCE *ws;
@@ -1558,18 +1654,18 @@ kvs_cursor_remove(WT_CURSOR *wtcursor)
*/
if (ws->config_bitfield) {
wtcursor->value.size = 1;
- wtcursor->value.data = "\0";
+ wtcursor->value.data = "";
return (update(wtcursor, 0));
}
return (update(wtcursor, 1));
}
/*
- * kvs_cursor_close --
+ * helium_cursor_close --
* WT_CURSOR.close method.
*/
static int
-kvs_cursor_close(WT_CURSOR *wtcursor)
+helium_cursor_close(WT_CURSOR *wtcursor)
{
CURSOR *cursor;
WT_EXTENSION_API *wtext;
@@ -1595,27 +1691,27 @@ kvs_cursor_close(WT_CURSOR *wtcursor)
* ws_source_name --
* Build a namespace name.
*/
-static inline int
+static int
ws_source_name(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, const char *suffix, char **pp)
{
DATA_SOURCE *ds;
WT_EXTENSION_API *wtext;
size_t len;
+ int ret = 0;
const char *p;
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
/*
- * Create the store's name. Application URIs are "memrata:device/XXX";
- * we want the names on the memrata device to be obviously WiredTiger's,
- * and the device name isn't interesting. Convert to "WiredTiger:XXX",
+ * Create the store's name. Application URIs are "helium:device/name";
+ * we want the names on the Helium device to be obviously WiredTiger's,
+ * and the device name isn't interesting. Convert to "WiredTiger:name",
* and add an optional suffix.
*/
- if (strncmp(uri, "memrata:", sizeof("memrata:") - 1) != 0 ||
- (p = strchr(uri, '/')) == NULL)
- ERET(wtext, session, EINVAL, "%s: illegal memrata URI", uri);
+ if (!prefix_match(uri, "helium:") || (p = strchr(uri, '/')) == NULL)
+ ERET(wtext, session, EINVAL, "%s: illegal Helium URI", uri);
++p;
len = strlen(WT_NAME_PREFIX) +
@@ -1628,85 +1724,39 @@ ws_source_name(WT_DATA_SOURCE *wtds,
}
/*
- * ws_source_drop_namespace --
- * Drop a namespace.
- */
-static int
-ws_source_drop_namespace(WT_DATA_SOURCE *wtds, WT_SESSION *session,
- const char *uri, const char *suffix, kvs_t kvs_device)
-{
- DATA_SOURCE *ds;
- WT_EXTENSION_API *wtext;
- int ret = 0;
- char *p;
-
- ds = (DATA_SOURCE *)wtds;
- wtext = ds->wtext;
- p = NULL;
-
- /* Drop the underlying KVS namespace. */
- if ((ret = ws_source_name(wtds, session, uri, suffix, &p)) != 0)
- return (ret);
- if ((ret = kvs_delete_namespace(kvs_device, p)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_delete_namespace: %s: %s", p, kvs_strerror(ret));
-
- free(p);
- return (ret);
-}
-
-/*
- * ws_source_rename_namespace --
- * Rename a namespace.
- */
-static int
-ws_source_rename_namespace(WT_DATA_SOURCE *wtds, WT_SESSION *session,
- const char *uri, const char *newuri, const char *suffix, kvs_t kvs_device)
-{
- DATA_SOURCE *ds;
- WT_EXTENSION_API *wtext;
- int ret = 0;
- char *p, *pnew;
-
- ds = (DATA_SOURCE *)wtds;
- wtext = ds->wtext;
- p = pnew = NULL;
-
- /* Rename the underlying KVS namespace. */
- ret = ws_source_name(wtds, session, uri, suffix, &p);
- if (ret == 0)
- ret = ws_source_name(wtds, session, newuri, suffix, &pnew);
- if (ret == 0 && (ret = kvs_rename_namespace(kvs_device, p, pnew)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_rename_namespace: %s: %s", p, kvs_strerror(ret));
-
- free(p);
- free(pnew);
- return (ret);
-}
-
-/*
* ws_source_close --
- * Kill a WT_SOURCE structure.
+ * Close a WT_SOURCE reference.
*/
static int
ws_source_close(WT_EXTENSION_API *wtext, WT_SESSION *session, WT_SOURCE *ws)
{
- int ret = 0;
+ int ret = 0, tret;
+ /*
+ * Warn if open cursors: it shouldn't happen because the upper layers of
+ * WiredTiger prevent it, so we don't do anything more than warn.
+ */
if (ws->ref != 0)
EMSG(wtext, session, WT_ERROR,
"%s: open object with %u open cursors being closed",
ws->uri, ws->ref);
- if (ws->kvs != NULL && (ret = kvs_close(ws->kvs)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_close: %s: %s", ws->uri, kvs_strerror(ret));
- ws->kvs = NULL;
- if (ws->kvscache != NULL && (ret = kvs_close(ws->kvscache)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_close: %s(cache): %s", ws->uri, kvs_strerror(ret));
- ws->kvscache = NULL;
+ if (ws->he != NULL) {
+ if ((tret = he_commit(ws->he)) != 0)
+ EMSG(wtext, session, tret,
+ "he_commit: %s: %s", ws->uri, he_strerror(tret));
+ if ((tret = he_close(ws->he)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s: %s", ws->uri, he_strerror(tret));
+ ws->he = NULL;
+ }
+ if (ws->he_cache != NULL) {
+ if ((tret = he_close(ws->he_cache)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s(cache): %s",
+ ws->uri, he_strerror(tret));
+ ws->he_cache = NULL;
+ }
if (ws->lockinit)
ESET(lock_destroy(wtext, session, &ws->lock));
@@ -1718,33 +1768,36 @@ ws_source_close(WT_EXTENSION_API *wtext, WT_SESSION *session, WT_SOURCE *ws)
}
/*
- * ws_source_open_namespace --
- * Open a namespace.
+ * ws_source_open_object --
+ * Open an object in the Helium store.
*/
static int
-ws_source_open_namespace(WT_DATA_SOURCE *wtds, WT_SESSION *session,
- const char *uri, const char *suffix, kvs_t kvs_device, int flags,
- kvs_t *kvsp)
+ws_source_open_object(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+ HELIUM_SOURCE *hs,
+ const char *uri, const char *suffix, int flags, he_t *hep)
{
DATA_SOURCE *ds;
WT_EXTENSION_API *wtext;
- kvs_t kvs;
+ he_t he;
char *p;
int ret = 0;
- *kvsp = NULL;
+ *hep = NULL;
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
p = NULL;
- /* Open the underlying KVS namespace. */
+ /* Open the underlying Helium object. */
if ((ret = ws_source_name(wtds, session, uri, suffix, &p)) != 0)
return (ret);
- if ((kvs = kvs_open_namespace(kvs_device, p, flags)) == NULL)
- EMSG(wtext, session, WT_ERROR,
- "kvs_open_namespace: %s: %s", p, kvs_strerror(os_errno()));
- *kvsp = kvs;
+ VMSG(wtext, session, VERBOSE_L1, "open %s/%s", hs->name, p);
+ if ((he = he_open(hs->device, p, flags, NULL)) == NULL) {
+ ret = os_errno();
+ EMSG(wtext, session, ret,
+ "he_open: %s/%s: %s", hs->name, p, he_strerror(ret));
+ }
+ *hep = he;
free(p);
return (ret);
@@ -1763,7 +1816,7 @@ ws_source_open(WT_DATA_SOURCE *wtds, WT_SESSION *session,
const char *uri, WT_CONFIG_ARG *config, u_int flags, WT_SOURCE **refp)
{
DATA_SOURCE *ds;
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
WT_CONFIG_ITEM a;
WT_EXTENSION_API *wtext;
WT_SOURCE *ws;
@@ -1778,26 +1831,26 @@ ws_source_open(WT_DATA_SOURCE *wtds, WT_SESSION *session,
ws = NULL;
/*
- * The URI will be "memrata:" followed by a KVS name and object name
- * pair separated by a slash, for example, "memrata:dev/object".
+ * The URI will be "helium:" followed by a Helium name and object name
+ * pair separated by a slash, for example, "helium:volume/object".
*/
- if (strncmp(uri, "memrata:", strlen("memrata:")) != 0)
+ if (!prefix_match(uri, "helium:"))
goto bad_name;
- p = uri + strlen("memrata:");
+ p = uri + strlen("helium:");
if (p[0] == '/' || (t = strchr(p, '/')) == NULL || t[1] == '\0')
bad_name: ERET(wtext, session, EINVAL, "%s: illegal name format", uri);
len = (size_t)(t - p);
- /* Find a matching KVS device. */
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
- if (STRING_MATCH(ks->name, p, len))
+ /* Find a matching Helium device. */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ if (string_match(hs->name, p, len))
break;
- if (ks == NULL)
+ if (hs == NULL)
ERET(wtext, NULL,
- EINVAL, "%s: no matching Memrata store found", uri);
+ EINVAL, "%s: no matching Helium store found", uri);
/*
- * We're about to walk the KVS device's list of files, acquire the
+ * We're about to walk the Helium device's list of files, acquire the
* global lock.
*/
if ((ret = writelock(wtext, session, &ds->global_lock)) != 0)
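For reference, a standalone sketch of the "helium:volume/object" name split performed above, assuming only standard C; the URI value is made up for illustration:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *uri = "helium:volume/object";	/* Made-up URI. */
	const char *p, *t;

	if (strncmp(uri, "helium:", strlen("helium:")) != 0)
		return (1);				/* Wrong prefix. */
	p = uri + strlen("helium:");
	if (p[0] == '/' || (t = strchr(p, '/')) == NULL || t[1] == '\0')
		return (1);				/* Illegal format. */

	/* Prints "store: volume, object: object". */
	printf("store: %.*s, object: %s\n", (int)(t - p), p, t + 1);
	return (0);
}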
@@ -1808,7 +1861,7 @@ bad_name: ERET(wtext, session, EINVAL, "%s: illegal name format", uri);
* for the object's lock, optionally check if the object is busy, and
* return.
*/
- for (ws = ks->ws_head; ws != NULL; ws = ws->next)
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next)
if (strcmp(ws->uri, uri) == 0) {
/* Check to see if the object is busy. */
if (ws->ref != 0 && (flags & WS_SOURCE_OPEN_BUSY)) {
@@ -1836,45 +1889,35 @@ bad_name: ERET(wtext, session, EINVAL, "%s: illegal name format", uri);
if ((ret = lock_init(wtext, session, &ws->lock)) != 0)
goto err;
ws->lockinit = 1;
- ws->ks = ks;
+ ws->hs = hs;
/*
- * Open the underlying KVS namespaces, then push the change.
+ * Open the underlying Helium objects, then push the change.
*
* The naming scheme is simple: the URI names the primary store, and the
* URI with a trailing suffix names the associated caching store.
*
- * We can set debug and truncate flags, we always set the create flag,
- * our caller handles attempts to create existing objects.
+	 * We can set the truncate flag; we always set the create flag, and our
+	 * caller handles attempts to create existing objects.
*/
- oflags = KVS_O_CREATE;
+ oflags = HE_O_CREATE;
if ((ret = wtext->config_get(wtext,
- session, config, "kvs_open_o_debug", &a)) == 0 && a.val != 0)
- oflags |= KVS_O_DEBUG;
- if (ret != 0 && ret != WT_NOTFOUND) {
- EMSG(wtext, session, ret,
- "kvs_open_o_debug configuration: %s", wtext->strerror(ret));
- goto err;
- }
- if ((ret = wtext->config_get(wtext,
- session, config, "kvs_open_o_truncate", &a)) == 0 && a.val != 0)
- oflags |= KVS_O_TRUNCATE;
- if (ret != 0 && ret != WT_NOTFOUND) {
- EMSG(wtext, session, ret,
- "kvs_open_o_truncate configuration: %s",
+ session, config, "helium_o_truncate", &a)) == 0 && a.val != 0)
+ oflags |= HE_O_TRUNCATE;
+ if (ret != 0 && ret != WT_NOTFOUND)
+ EMSG_ERR(wtext, session, ret,
+ "helium_o_truncate configuration: %s",
wtext->strerror(ret));
- goto err;
- }
- if ((ret = ws_source_open_namespace(wtds,
- session, uri, NULL, ks->kvs_device, oflags, &ws->kvs)) != 0)
+ if ((ret = ws_source_open_object(
+ wtds, session, hs, uri, NULL, oflags, &ws->he)) != 0)
goto err;
- if ((ret = ws_source_open_namespace(wtds, session,
- uri, WT_NAME_CACHE, ks->kvs_device, oflags, &ws->kvscache)) != 0)
+ if ((ret = ws_source_open_object(
+ wtds, session, hs, uri, WT_NAME_CACHE, oflags, &ws->he_cache)) != 0)
goto err;
- if ((ret = kvs_commit(ws->kvs)) != 0)
- EMSG_ERR(wtext, session, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
+ if ((ret = he_commit(ws->he)) != 0)
+ EMSG_ERR(wtext, session, ret,
+ "he_commit: %s", he_strerror(ret));
/* Optionally trade the global lock for the object lock. */
if (!(flags & WS_SOURCE_OPEN_GLOBAL) &&
@@ -1882,8 +1925,8 @@ bad_name: ERET(wtext, session, EINVAL, "%s: illegal name format", uri);
goto err;
/* Insert the new entry at the head of the list. */
- ws->next = ks->ws_head;
- ks->ws_head = ws;
+ ws->next = hs->ws_head;
+ hs->ws_head = ws;
*refp = ws;
ws = NULL;
@@ -1905,7 +1948,7 @@ err: if (ws != NULL)
/*
* master_uri_get --
- * Get the KVS master record for a URI.
+ * Get the Helium master record for a URI.
*/
static int
master_uri_get(WT_DATA_SOURCE *wtds,
@@ -1922,7 +1965,7 @@ master_uri_get(WT_DATA_SOURCE *wtds,
/*
* master_uri_drop --
- * Drop the KVS master record for a URI.
+ * Drop the Helium master record for a URI.
*/
static int
master_uri_drop(WT_DATA_SOURCE *wtds, WT_SESSION *session, const char *uri)
@@ -1938,7 +1981,7 @@ master_uri_drop(WT_DATA_SOURCE *wtds, WT_SESSION *session, const char *uri)
/*
* master_uri_rename --
- * Rename the KVS master record for a URI.
+ * Rename the Helium master record for a URI.
*/
static int
master_uri_rename(WT_DATA_SOURCE *wtds,
@@ -1971,14 +2014,14 @@ err: free((void *)value);
/*
* master_uri_set --
- * Set the KVS master record for a URI.
+ * Set the Helium master record for a URI.
*/
static int
master_uri_set(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
- WT_CONFIG_ITEM a, b;
+ WT_CONFIG_ITEM a, b, c;
WT_EXTENSION_API *wtext;
int exclusive, ret = 0;
char value[1024];
@@ -2016,14 +2059,27 @@ master_uri_set(WT_DATA_SOURCE *wtds,
wtext->strerror(ret));
}
+ /* Get the compression configuration. */
+ if ((ret = wtext->config_get(
+ wtext, session, config, "helium_o_compress", &c)) != 0) {
+ if (ret == WT_NOTFOUND)
+ c.val = 0;
+ else
+ ERET(wtext, session, ret,
+ "helium_o_compress configuration: %s",
+ wtext->strerror(ret));
+ }
+
/*
* Create a new reference using insert (which fails if the record
- * already exists). If that succeeds, we just used up a unique ID,
- * update the master ID record.
+ * already exists).
*/
(void)snprintf(value, sizeof(value),
- "version=(major=%d,minor=%d),key_format=%.*s,value_format=%.*s",
- KVS_MAJOR, KVS_MINOR, (int)a.len, a.str, (int)b.len, b.str);
+ "wiredtiger_helium_version=(major=%d,minor=%d),"
+ "key_format=%.*s,value_format=%.*s,"
+ "helium_o_compress=%d",
+ WIREDTIGER_HELIUM_MAJOR, WIREDTIGER_HELIUM_MINOR,
+ (int)a.len, a.str, (int)b.len, b.str, c.val ? 1 : 0);
if ((ret = wtext->metadata_insert(wtext, session, uri, value)) == 0)
return (0);
if (ret == WT_DUPLICATE_KEY)
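To make the master record format above concrete, here is a standalone sketch that builds the same value string for a hypothetical object with key_format=r, value_format=u and compression enabled; the version numbers are placeholders, not the real WIREDTIGER_HELIUM_MAJOR/MINOR values:

#include <stdio.h>

int
main(void)
{
	char value[1024];

	(void)snprintf(value, sizeof(value),
	    "wiredtiger_helium_version=(major=%d,minor=%d),"
	    "key_format=%s,value_format=%s,"
	    "helium_o_compress=%d",
	    1, 0, "r", "u", 1);		/* Placeholder version and formats. */

	/* wiredtiger_helium_version=(major=1,minor=0),key_format=r,... */
	printf("%s\n", value);
	return (0);
}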
@@ -2032,11 +2088,11 @@ master_uri_set(WT_DATA_SOURCE *wtds,
}
/*
- * kvs_session_open_cursor --
+ * helium_session_open_cursor --
* WT_SESSION.open_cursor method.
*/
static int
-kvs_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+helium_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session,
const char *uri, WT_CONFIG_ARG *config, WT_CURSOR **new_cursor)
{
CURSOR *cursor;
@@ -2078,15 +2134,15 @@ kvs_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session,
"collator configuration: %s", wtext->strerror(ret));
/* Finish initializing the cursor. */
- cursor->wtcursor.close = kvs_cursor_close;
- cursor->wtcursor.insert = kvs_cursor_insert;
- cursor->wtcursor.next = kvs_cursor_next;
- cursor->wtcursor.prev = kvs_cursor_prev;
- cursor->wtcursor.remove = kvs_cursor_remove;
- cursor->wtcursor.reset = kvs_cursor_reset;
- cursor->wtcursor.search = kvs_cursor_search;
- cursor->wtcursor.search_near = kvs_cursor_search_near;
- cursor->wtcursor.update = kvs_cursor_update;
+ cursor->wtcursor.close = helium_cursor_close;
+ cursor->wtcursor.insert = helium_cursor_insert;
+ cursor->wtcursor.next = helium_cursor_next;
+ cursor->wtcursor.prev = helium_cursor_prev;
+ cursor->wtcursor.remove = helium_cursor_remove;
+ cursor->wtcursor.reset = helium_cursor_reset;
+ cursor->wtcursor.search = helium_cursor_search;
+ cursor->wtcursor.search_near = helium_cursor_search_near;
+ cursor->wtcursor.update = helium_cursor_update;
cursor->wtext = wtext;
cursor->record.key = cursor->__key;
@@ -2123,21 +2179,28 @@ kvs_session_open_cursor(WT_DATA_SOURCE *wtds, WT_SESSION *session,
ws->config_bitfield =
v.len == 2 && isdigit(v.str[0]) && v.str[1] == 't';
+ if ((ret = wtext->config_strget(
+ wtext, session, value, "helium_o_compress", &v)) != 0)
+ EMSG_ERR(wtext, session, ret,
+ "helium_o_compress configuration: %s",
+ wtext->strerror(ret));
+ ws->config_compress = v.val ? 1 : 0;
+
/*
* If it's a record-number key, read the last record from the
* object and set the allocation record value.
*/
if (ws->config_recno) {
wtcursor = (WT_CURSOR *)cursor;
- if ((ret = kvs_cursor_reset(wtcursor)) != 0)
+ if ((ret = helium_cursor_reset(wtcursor)) != 0)
goto err;
- if ((ret = kvs_cursor_prev(wtcursor)) == 0)
+ if ((ret = helium_cursor_prev(wtcursor)) == 0)
ws->append_recno = wtcursor->recno;
else if (ret != WT_NOTFOUND)
goto err;
- if ((ret = kvs_cursor_reset(wtcursor)) != 0)
+ if ((ret = helium_cursor_reset(wtcursor)) != 0)
goto err;
}
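The record-number setup above positions the cursor on the last record so later appends can continue from the highest existing key. A minimal sketch of that idea over a plain array standing in for the store; everything here is hypothetical:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Record numbers already in the (stand-in) store, in order. */
	uint64_t recnos[] = { 1, 2, 5, 9 };
	size_t n = sizeof(recnos) / sizeof(recnos[0]);
	uint64_t append_recno;

	/* A "cursor prev" on an empty store finds nothing: start from 0. */
	append_recno = n == 0 ? 0 : recnos[n - 1];

	/* The next appended record gets append_recno + 1, here 10. */
	printf("next record number: %llu\n",
	    (unsigned long long)(append_recno + 1));
	return (0);
}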
@@ -2161,11 +2224,11 @@ err: if (ws != NULL && locked)
}
/*
- * kvs_session_create --
+ * helium_session_create --
* WT_SESSION.create method.
*/
static int
-kvs_session_create(WT_DATA_SOURCE *wtds,
+helium_session_create(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
@@ -2191,7 +2254,7 @@ kvs_session_create(WT_DATA_SOURCE *wtds,
* We've discarded the lock, but that's OK, creates are single-threaded
* at the WiredTiger level, it's not our problem to solve.
*
- * If unable to enter a WiredTiger record, leave the KVS store alone.
+ * If unable to enter a WiredTiger record, leave the Helium store alone.
* A subsequent create should do the right thing, we aren't leaving
* anything in an inconsistent state.
*/
@@ -2199,15 +2262,15 @@ kvs_session_create(WT_DATA_SOURCE *wtds,
}
/*
- * kvs_session_drop --
+ * helium_session_drop --
* WT_SESSION.drop method.
*/
static int
-kvs_session_drop(WT_DATA_SOURCE *wtds,
+helium_session_drop(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
WT_EXTENSION_API *wtext;
WT_SOURCE **p, *ws;
int ret = 0;
@@ -2217,7 +2280,7 @@ kvs_session_drop(WT_DATA_SOURCE *wtds,
/*
* Get a locked reference to the data source: hold the global lock,
- * we are going to change the list of objects for a KVS store.
+ * we're changing the HELIUM_SOURCE's list of WT_SOURCE objects.
*
* Remove the entry from the WT_SOURCE list -- it's a singly-linked
* list, find the reference to it.
@@ -2225,28 +2288,23 @@ kvs_session_drop(WT_DATA_SOURCE *wtds,
if ((ret = ws_source_open(wtds, session, uri, config,
WS_SOURCE_OPEN_BUSY | WS_SOURCE_OPEN_GLOBAL, &ws)) != 0)
return (ret);
- ks = ws->ks;
- for (p = &ks->ws_head; *p != NULL; p = &(*p)->next)
+ hs = ws->hs;
+ for (p = &hs->ws_head; *p != NULL; p = &(*p)->next)
if (*p == ws) {
*p = (*p)->next;
break;
}
- /* Close the source, discarding the handles and structure. */
+ /* Drop the underlying Helium objects. */
+ ESET(he_remove(ws->he));
+ ws->he = NULL; /* The handle is dead. */
+ ESET(he_remove(ws->he_cache));
+ ws->he_cache = NULL; /* The handle is dead. */
+
+ /* Close the source, discarding the structure. */
ESET(ws_source_close(wtext, session, ws));
ws = NULL;
- /* Drop the underlying namespaces. */
- ESET(ws_source_drop_namespace(
- wtds, session, uri, NULL, ks->kvs_device));
- ESET(ws_source_drop_namespace(
- wtds, session, uri, WT_NAME_CACHE, ks->kvs_device));
-
- /* Push the change. */
- if ((ret = kvs_commit(ks->kvs_device)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
-
/* Discard the metadata entry. */
ESET(master_uri_drop(wtds, session, uri));
@@ -2262,19 +2320,18 @@ kvs_session_drop(WT_DATA_SOURCE *wtds,
}
/*
- * kvs_session_rename --
+ * helium_session_rename --
* WT_SESSION.rename method.
*/
static int
-kvs_session_rename(WT_DATA_SOURCE *wtds, WT_SESSION *session,
+helium_session_rename(WT_DATA_SOURCE *wtds, WT_SESSION *session,
const char *uri, const char *newuri, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
- KVS_SOURCE *ks;
WT_EXTENSION_API *wtext;
WT_SOURCE *ws;
int ret = 0;
- char *copy;
+ char *p;
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
@@ -2287,27 +2344,26 @@ kvs_session_rename(WT_DATA_SOURCE *wtds, WT_SESSION *session,
if ((ret = ws_source_open(wtds, session, uri, config,
WS_SOURCE_OPEN_BUSY | WS_SOURCE_OPEN_GLOBAL, &ws)) != 0)
return (ret);
- ks = ws->ks;
- /* Get a copy of the new name. */
- if ((copy = strdup(newuri)) == NULL) {
+ /* Get a copy of the new name for the WT_SOURCE structure. */
+ if ((p = strdup(newuri)) == NULL) {
ret = os_errno();
goto err;
}
free(ws->uri);
- ws->uri = copy;
- copy = NULL;
+ ws->uri = p;
- /* Rename the underlying namespaces. */
- ESET(ws_source_rename_namespace(
- wtds, session, uri, newuri, NULL, ks->kvs_device));
- ESET(ws_source_rename_namespace(
- wtds, session, uri, newuri, WT_NAME_CACHE, ks->kvs_device));
-
- /* Push the change. */
- if ((ret = kvs_commit(ws->kvs)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
+ /* Rename the underlying Helium objects. */
+ ESET(ws_source_name(wtds, session, newuri, NULL, &p));
+ if (ret == 0) {
+ ESET(he_rename(ws->he, p));
+ free(p);
+ }
+ ESET(ws_source_name(wtds, session, newuri, WT_NAME_CACHE, &p));
+ if (ret == 0) {
+ ESET(he_rename(ws->he_cache, p));
+ free(p);
+ }
/* Update the metadata record. */
ESET(master_uri_rename(wtds, session, uri, newuri));
@@ -2325,17 +2381,17 @@ err: ESET(unlock(wtext, session, &ds->global_lock));
}
/*
- * kvs_session_truncate --
+ * helium_session_truncate --
* WT_SESSION.truncate method.
*/
static int
-kvs_session_truncate(WT_DATA_SOURCE *wtds,
+helium_session_truncate(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
WT_EXTENSION_API *wtext;
WT_SOURCE *ws;
- int ret = 0;
+ int ret = 0, tret;
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
@@ -2346,47 +2402,42 @@ kvs_session_truncate(WT_DATA_SOURCE *wtds,
return (ret);
/* Truncate the underlying namespaces. */
- if ((ret = kvs_truncate(ws->kvs)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_truncate: %s: %s", ws->uri, kvs_strerror(ret));
- if ((ret = kvs_truncate(ws->kvscache)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_truncate: %s: %s", ws->uri, kvs_strerror(ret));
+ if ((tret = he_truncate(ws->he)) != 0)
+ EMSG(wtext, session, tret,
+ "he_truncate: %s: %s", ws->uri, he_strerror(tret));
+ if ((tret = he_truncate(ws->he_cache)) != 0)
+ EMSG(wtext, session, tret,
+ "he_truncate: %s: %s", ws->uri, he_strerror(tret));
ESET(unlock(wtext, session, &ws->lock));
return (ret);
}
/*
- * kvs_session_verify --
+ * helium_session_verify --
* WT_SESSION.verify method.
*/
static int
-kvs_session_verify(WT_DATA_SOURCE *wtds,
+helium_session_verify(WT_DATA_SOURCE *wtds,
WT_SESSION *session, const char *uri, WT_CONFIG_ARG *config)
{
- DATA_SOURCE *ds;
- WT_EXTENSION_API *wtext;
-
+ (void)wtds;
+ (void)session;
(void)uri;
(void)config;
-
- ds = (DATA_SOURCE *)wtds;
- wtext = ds->wtext;
-
- ERET(wtext, session, ENOTSUP, "verify: %s", strerror(ENOTSUP));
+ return (0);
}
/*
- * kvs_session_checkpoint --
+ * helium_session_checkpoint --
* WT_SESSION.checkpoint method.
*/
static int
-kvs_session_checkpoint(
+helium_session_checkpoint(
WT_DATA_SOURCE *wtds, WT_SESSION *session, WT_CONFIG_ARG *config)
{
DATA_SOURCE *ds;
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
WT_EXTENSION_API *wtext;
int ret = 0;
@@ -2395,208 +2446,68 @@ kvs_session_checkpoint(
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
- /*
- * Flush the device.
- *
- * XXX
- * This is a placeholder until we figure out what recovery is going
- * to look like.
- */
- if ((ks = ds->kvs_head) != NULL &&
- (ret = kvs_commit(ks->kvs_device)) != 0)
- ERET(wtext, session, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
-
- return (0);
-}
-
-/*
- * kvs_config_devices --
- * Convert the device list into an argv[] array.
- */
-static int
-kvs_config_devices(
- WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *orig, char ***devices)
-{
- WT_CONFIG_ITEM k, v;
- WT_CONFIG_SCAN *scan;
- size_t len;
- u_int cnt, slots;
- int ret = 0;
- char **argv, **p;
-
- argv = NULL;
-
- /* Set up the scan of the device list. */
- if ((ret = wtext->config_scan_begin(
- wtext, NULL, orig->str, orig->len, &scan)) != 0)
- EMSG_ERR(wtext, NULL, ret,
- "WT_EXTENSION_API.config_scan_begin: %s",
- wtext->strerror(ret));
-
- for (cnt = slots = 0; (ret = wtext->
- config_scan_next(wtext, scan, &k, &v)) == 0; ++cnt) {
- if (cnt + 1 >= slots) { /* NULL-terminate the array */
- len = slots + 20 * sizeof(*argv);
- if ((p = realloc(argv, len)) == NULL) {
- ret = os_errno();
- goto err;
- }
- argv = p;
- slots += 20;
- }
- len = k.len + 1;
- if ((argv[cnt] = calloc(len, sizeof(**argv))) == NULL) {
- ret = os_errno();
- goto err;
- }
- argv[cnt + 1] = NULL;
- memcpy(argv[cnt], k.str, k.len);
- }
- if (ret != WT_NOTFOUND)
- EMSG_ERR(wtext, NULL, ret,
- "WT_EXTENSION_API.config_scan_next: %s",
- wtext->strerror(ret));
- if ((ret = wtext->config_scan_end(wtext, scan)) != 0)
- EMSG_ERR(wtext, NULL, ret,
- "WT_EXTENSION_API.config_scan_end: %s",
- wtext->strerror(ret));
+ /* Flush all volumes. */
+ if ((hs = ds->hs_head) != NULL &&
+ (ret = he_commit(hs->he_volume)) != 0)
+ ERET(wtext, session, ret,
+ "he_commit: %s: %s", hs->device, he_strerror(ret));
- *devices = argv;
return (0);
-
-err: if (argv != NULL) {
- for (p = argv; *p != NULL; ++p)
- free(*p);
- free(argv);
- }
- return (ret);
-}
-
-/*
- * kvs_config_read --
- * Read KVS configuration.
- */
-static int
-kvs_config_read(WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *config,
- char ***devices, struct kvs_config *kvs_config, int *flagsp)
-{
- WT_CONFIG_ITEM k, v;
- WT_CONFIG_SCAN *scan;
- int ret = 0, tret;
-
- *flagsp = 0; /* Return default values. */
- if ((ret = kvs_default_config(kvs_config)) != 0)
- ERET(wtext, NULL,
- EINVAL, "kvs_default_config: %s", kvs_strerror(os_errno()));
-
- /* Set up the scan of the configuration arguments list. */
- if ((ret = wtext->config_scan_begin(
- wtext, NULL, config->str, config->len, &scan)) != 0)
- ERET(wtext, NULL, ret,
- "WT_EXTENSION_API.config_scan_begin: %s",
- wtext->strerror(ret));
- while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0) {
- if (STRING_MATCH("kvs_devices", k.str, k.len)) {
- if ((ret = kvs_config_devices(wtext, &v, devices)) != 0)
- return (ret);
- continue;
- }
-
-#define KVS_CONFIG_SET(s, f) \
- if (STRING_MATCH(s, k.str, k.len)) { \
- kvs_config->f = (unsigned long)v.val; \
- continue; \
- }
- KVS_CONFIG_SET("kvs_parallelism", parallelism);
- KVS_CONFIG_SET("kvs_granularity", granularity);
- KVS_CONFIG_SET("kvs_avg_key_len", avg_key_len);
- KVS_CONFIG_SET("kvs_avg_val_len", avg_val_len);
- KVS_CONFIG_SET("kvs_write_bufs", write_bufs);
- KVS_CONFIG_SET("kvs_read_bufs", read_bufs);
- KVS_CONFIG_SET("kvs_commit_timeout", commit_timeout);
- KVS_CONFIG_SET("kvs_reclaim_threshold", reclaim_threshold);
- KVS_CONFIG_SET("kvs_reclaim_period", reclaim_period);
-
-#define KVS_FLAG_SET(s, f) \
- if (STRING_MATCH(s, k.str, k.len)) { \
- if (v.val != 0) \
- *flagsp |= f; \
- continue; \
- }
- /*
- * We don't export KVS_O_CREATE: WT_SESSION.create
- * always adds it in.
- */
- KVS_FLAG_SET("kvs_open_o_debug", KVS_O_DEBUG);
- KVS_FLAG_SET("kvs_open_o_truncate", KVS_O_TRUNCATE);
-
- EMSG_ERR(wtext, NULL, EINVAL,
- "unknown configuration key value pair %.*s/%.*s",
- (int)k.len, k.str, (int)v.len, v.str);
- }
-
- if (ret == WT_NOTFOUND)
- ret = 0;
- if (ret != 0)
- EMSG_ERR(wtext, NULL, ret,
- "WT_EXTENSION_API.config_scan_next: %s",
- wtext->strerror(ret));
-
-err: if ((tret = wtext->config_scan_end(wtext, scan)) != 0)
- EMSG(wtext, NULL, tret,
- "WT_EXTENSION_API.config_scan_end: %s",
- wtext->strerror(ret));
-
- return (ret);
}
/*
- * kvs_source_close --
- * Kill a KVS_SOURCE structure.
+ * helium_source_close --
+ * Discard a HELIUM_SOURCE.
*/
static int
-kvs_source_close(WT_EXTENSION_API *wtext, WT_SESSION *session, KVS_SOURCE *ks)
+helium_source_close(
+ WT_EXTENSION_API *wtext, WT_SESSION *session, HELIUM_SOURCE *hs)
{
WT_SOURCE *ws;
int ret = 0, tret;
/* Resolve the cache into the primary one last time and quit. */
- if (ks->cleaner_id != 0) {
- ks->cleaner_stop = 1;
+ if (hs->cleaner_id != 0) {
+ hs->cleaner_stop = 1;
- if ((tret = pthread_join(ks->cleaner_id, NULL)) != 0)
+ if ((tret = pthread_join(hs->cleaner_id, NULL)) != 0)
EMSG(wtext, session, tret,
"pthread_join: %s", strerror(tret));
- ks->cleaner_id = 0;
+ hs->cleaner_id = 0;
}
/* Close the underlying WiredTiger sources. */
- while ((ws = ks->ws_head) != NULL) {
- ks->ws_head = ws->next;
+ while ((ws = hs->ws_head) != NULL) {
+ hs->ws_head = ws->next;
ESET(ws_source_close(wtext, session, ws));
}
- /* Flush and close the KVS source. */
- if (ks->kvs_device != NULL) {
- if ((tret = kvs_commit(ks->kvs_device)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_commit: %s: %s", ks->name, kvs_strerror(tret));
+ /* If the owner, close the database transaction store. */
+ if (hs->he_txn != NULL && hs->he_owner) {
+ if ((tret = he_close(hs->he_txn)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s: %s: %s",
+ hs->name, WT_NAME_TXN, he_strerror(tret));
+ hs->he_txn = NULL;
+ }
- /* If the owner, close the database transaction store. */
- if (ks->kvsowner && (tret = kvs_close(ks->kvstxn)) != 0)
+ /* Flush and close the Helium source. */
+ if (hs->he_volume != NULL) {
+ if ((tret = he_commit(hs->he_volume)) != 0)
EMSG(wtext, session, tret,
- "kvs_close: %s: %s",
- WT_NAME_TXN, kvs_strerror(tret));
+ "he_commit: %s: %s",
+ hs->device, he_strerror(tret));
- if ((tret = kvs_close(ks->kvs_device)) != 0)
- EMSG(wtext, session, WT_ERROR,
- "kvs_close: %s: %s", ks->name, kvs_strerror(tret));
- ks->kvs_device = NULL;
+ if ((tret = he_close(hs->he_volume)) != 0)
+ EMSG(wtext, session, tret,
+ "he_close: %s: %s: %s",
+ hs->name, WT_NAME_INIT, he_strerror(tret));
+ hs->he_volume = NULL;
}
- free(ks->name);
- OVERWRITE_AND_FREE(ks);
+ free(hs->name);
+ free(hs->device);
+ OVERWRITE_AND_FREE(hs);
return (ret);
}
@@ -2609,12 +2520,12 @@ static int
cache_cleaner(WT_EXTENSION_API *wtext,
WT_CURSOR *wtcursor, uint64_t oldest, uint64_t *txnminp)
{
- struct kvs_record *r;
CACHE_RECORD *cp;
CURSOR *cursor;
+ HE_ITEM *r;
WT_SOURCE *ws;
uint64_t txnid;
- int locked, recovery, ret = 0;
+ int locked, pushed, recovery, ret = 0;
/*
* Called in two ways: in normal processing mode where we're supplied a
@@ -2633,14 +2544,14 @@ cache_cleaner(WT_EXTENSION_API *wtext,
cursor = (CURSOR *)wtcursor;
ws = cursor->ws;
r = &cursor->record;
- locked = 0;
+ locked = pushed = 0;
/*
* For every cache key where all updates are globally visible:
* Migrate the most recent update value to the primary store.
*/
for (r->key_len = 0; (ret =
- kvs_call(wtcursor, "kvs_next", ws->kvscache, kvs_next)) == 0;) {
+ helium_call(wtcursor, "he_next", ws->he_cache, he_next)) == 0;) {
/*
* Unmarshall the value, and if all of the updates are globally
* visible, update the primary with the last committed update.
@@ -2660,8 +2571,10 @@ cache_cleaner(WT_EXTENSION_API *wtext,
cache_value_last_not_aborted(wtcursor, &cp);
if (cp == NULL)
continue;
+
+ pushed = 1;
if (cp->remove) {
- if ((ret = kvs_del(ws->kvs, r)) == 0)
+ if ((ret = he_delete(ws->he, r)) == 0)
continue;
/*
@@ -2669,35 +2582,50 @@ cache_cleaner(WT_EXTENSION_API *wtext,
* primary at all, that is, an insert and remove pair
* may be confined to the cache.
*/
- if (ret == KVS_E_KEY_NOT_FOUND) {
+ if (ret == HE_ERR_ITEM_NOT_FOUND) {
ret = 0;
continue;
}
- ERET(wtext, NULL, WT_ERROR,
- "kvs_del: %s", kvs_strerror(ret));
+ ERET(wtext, NULL, ret,
+ "he_delete: %s", he_strerror(ret));
} else {
r->val = cp->v;
r->val_len = cp->len;
- if ((ret = kvs_set(ws->kvs, r)) == 0)
+			/*
+			 * If compression is configured for this datastore, set
+			 * the compression flag: we're updating the "real"
+			 * store.
+			 */
+ if (ws->config_compress)
+ r->flags |= HE_I_COMPRESS;
+ ret = he_update(ws->he, r);
+ r->flags = 0;
+ if (ret == 0)
continue;
- ERET(wtext, NULL, WT_ERROR,
- "kvs_set: %s", kvs_strerror(ret));
+
+ ERET(wtext, NULL, ret,
+ "he_update: %s", he_strerror(ret));
}
}
if (ret == WT_NOTFOUND)
ret = 0;
if (ret != 0)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_next: %s", kvs_strerror(ret));
+ ERET(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
+
+ /*
+	 * If we didn't move any keys from the cache to the primary, quit. It's
+	 * possible we could still remove values from the cache, but not likely,
+	 * and another pass would probably be wasted effort, especially since
+	 * that pass runs with the lock held.
+ */
+ if (!pushed)
+ return (0);
/*
* Push the store to stable storage for correctness. (It doesn't matter
- * what Memrata handle we push, so we just push one of them.)
+ * what Helium handle we commit, so we just commit one of them.)
*/
- if ((ret = kvs_commit(ws->kvs)) != 0)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
+ if ((ret = he_commit(ws->he)) != 0)
+ ERET(wtext, NULL, ret, "he_commit: %s", he_strerror(ret));
/*
* If we're performing recovery, that's all we need to do, we're going
@@ -2719,7 +2647,7 @@ cache_cleaner(WT_EXTENSION_API *wtext,
locked = 1;
for (r->key_len = 0; (ret =
- kvs_call(wtcursor, "kvs_next", ws->kvscache, kvs_next)) == 0;) {
+ helium_call(wtcursor, "he_next", ws->he_cache, he_next)) == 0;) {
/*
* Unmarshall the value, and if all of the updates are globally
* visible, remove the cache entry.
@@ -2727,9 +2655,9 @@ cache_cleaner(WT_EXTENSION_API *wtext,
if ((ret = cache_value_unmarshall(wtcursor)) != 0)
goto err;
if (cache_value_visible_all(wtcursor, oldest)) {
- if ((ret = kvs_del(ws->kvscache, r)) != 0)
- EMSG_ERR(wtext, NULL, WT_ERROR,
- "kvs_del: %s", kvs_strerror(ret));
+ if ((ret = he_delete(ws->he_cache, r)) != 0)
+ EMSG_ERR(wtext, NULL, ret,
+ "he_delete: %s", he_strerror(ret));
continue;
}
@@ -2752,8 +2680,7 @@ cache_cleaner(WT_EXTENSION_API *wtext,
if (ret == WT_NOTFOUND)
ret = 0;
if (ret != 0)
- EMSG_ERR(wtext, NULL, WT_ERROR,
- "kvs_next: %s", kvs_strerror(ret));
+ EMSG_ERR(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
err: if (locked)
ESET(unlock(wtext, NULL, &ws->lock));
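The cleaning pass above pushes the last committed update for each cache key into the primary: a removal becomes a delete, where "not found" is acceptable because the pair may never have reached the primary, and anything else becomes an update, optionally flagged for compression. A minimal sketch of that decision with hypothetical primary_delete()/primary_put() helpers (not the Helium API):

#include <stdio.h>
#include <string.h>

#define	NOT_FOUND	(-1)		/* Hypothetical "no such key" code. */

/* Hypothetical primary-store operations. */
static int
primary_delete(const char *key)
{
	(void)key;
	return (NOT_FOUND);		/* Pretend the key was cache-only. */
}

static int
primary_put(const char *key, const void *val, size_t len, int compress)
{
	(void)val;
	printf("put %s (%zu bytes%s)\n",
	    key, len, compress ? ", compressed" : "");
	return (0);
}

/*
 * migrate_one --
 *	Push a single cache entry's last committed update to the primary.
 */
static int
migrate_one(const char *key,
    const void *val, size_t len, int is_remove, int compress)
{
	int ret;

	if (is_remove) {
		/* The key may only ever have existed in the cache. */
		if ((ret = primary_delete(key)) == NOT_FOUND)
			ret = 0;
		return (ret);
	}
	return (primary_put(key, val, len, compress));
}

int
main(void)
{
	const char *v = "value";

	(void)migrate_one("gone", NULL, 0, 1, 0);	/* Remove wins. */
	(void)migrate_one("kept", v, strlen(v), 0, 1);	/* Update wins. */
	return (0);
}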
@@ -2766,11 +2693,11 @@ err: if (locked)
* Discard no longer needed entries from the transaction store.
*/
static int
-txn_cleaner(WT_CURSOR *wtcursor, kvs_t kvstxn, uint64_t txnmin)
+txn_cleaner(WT_CURSOR *wtcursor, he_t he_txn, uint64_t txnmin)
{
CURSOR *cursor;
+ HE_ITEM *r;
WT_EXTENSION_API *wtext;
- struct kvs_record *r;
uint64_t txnid;
int ret = 0;
@@ -2783,23 +2710,23 @@ txn_cleaner(WT_CURSOR *wtcursor, kvs_t kvstxn, uint64_t txnmin)
* oldest transaction ID that appears anywhere in any cache.
*/
for (r->key_len = 0;
- (ret = kvs_call(wtcursor, "kvs_next", kvstxn, kvs_next)) == 0;) {
+ (ret = helium_call(wtcursor, "he_next", he_txn, he_next)) == 0;) {
memcpy(&txnid, r->key, sizeof(txnid));
- if (txnid < txnmin && (ret = kvs_del(kvstxn, r)) != 0)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_del: %s", kvs_strerror(ret));
+ if (txnid < txnmin && (ret = he_delete(he_txn, r)) != 0)
+ ERET(wtext, NULL, ret,
+ "he_delete: %s", he_strerror(ret));
}
if (ret == WT_NOTFOUND)
ret = 0;
if (ret != 0)
- ERET(wtext, NULL, WT_ERROR, "kvs_next: %s", kvs_strerror(ret));
+ ERET(wtext, NULL, ret, "he_next: %s", he_strerror(ret));
return (0);
}
/*
* fake_cursor --
- * Fake up enough of a cursor to do KVS operations.
+ * Fake up enough of a cursor to do Helium operations.
*/
static int
fake_cursor(WT_EXTENSION_API *wtext, WT_CURSOR **wtcursorp)
@@ -2832,49 +2759,32 @@ fake_cursor(WT_EXTENSION_API *wtext, WT_CURSOR **wtcursorp)
}
/*
- * kvs_cleaner --
+ * cache_cleaner_worker --
* Thread to migrate data from the cache to the primary.
*/
static void *
-kvs_cleaner(void *arg)
+cache_cleaner_worker(void *arg)
{
struct timeval t;
CURSOR *cursor;
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
+ HE_STATS stats;
WT_CURSOR *wtcursor;
WT_EXTENSION_API *wtext;
WT_SOURCE *ws;
uint64_t oldest, txnmin, txntmp;
int cleaner_stop, delay, ret = 0;
- ks = (KVS_SOURCE *)arg;
+ hs = (HELIUM_SOURCE *)arg;
cursor = NULL;
- wtext = ks->wtext;
+ wtext = hs->wtext;
if ((ret = fake_cursor(wtext, &wtcursor)) != 0)
- EMSG_ERR(wtext, NULL, ret, "kvs_cleaner: %s", strerror(ret));
+ EMSG_ERR(wtext, NULL, ret, "cleaner: %s", strerror(ret));
cursor = (CURSOR *)wtcursor;
- for (delay = 1;;) {
- /*
- * Check the underlying caches for either a number of operations
- * or a number of bytes. It's more expensive to return values
- * from the cache (because we have to marshall/unmarshall them),
- * but there's no information yet on how to tune the values.
- *
- * For now, use 10MB as the limit, and a corresponding number of
- * operations, assuming roughly 40B per key/value pair.
- */
-#undef BYTELIMIT
-#define BYTELIMIT (10 * 1048576)
-#undef OPLIMIT
-#define OPLIMIT (BYTELIMIT / (2 * 20))
- for (ws = ks->ws_head; ws != NULL; ws = ws->next)
- if (ws->cleaner_ops > OPLIMIT ||
- ws->cleaner_bytes > BYTELIMIT)
- break;
-
+ for (cleaner_stop = delay = 0; !cleaner_stop;) {
/*
* Check if this will be the final run; cleaner_stop is declared
* volatile, and so the read will happen. We don't much care if
@@ -2882,16 +2792,45 @@ kvs_cleaner(void *arg)
* and finds the variable set. Store the read locally, reading
* the variable twice might race.
*/
- cleaner_stop = ks->cleaner_stop;
- if (ws == NULL && !cleaner_stop) {
- if (delay < 5) /* At least every 5 seconds. */
- ++delay;
+ cleaner_stop = hs->cleaner_stop;
+
+ /*
+ * Delay if this isn't the final run and the last pass didn't
+ * find any work to do.
+ */
+ if (!cleaner_stop && delay != 0) {
t.tv_sec = delay;
t.tv_usec = 0;
(void)select(0, NULL, NULL, NULL, &t);
- continue;
}
+ /* Run at least every 5 seconds. */
+ if (delay < 5)
+ ++delay;
+
+ /*
+	 * Clean the datastore caches, depending on their size. It's
+	 * both more and less expensive to return values from the cache:
+	 * more because we have to marshall/unmarshall the values, less
+	 * because there's only a single call, to the cache store rather
+	 * than one to the cache and one to the primary. I have no tuning
+	 * information; for now, simply set the limit at 50MB.
+ */
+#undef CACHE_SIZE_TRIGGER
+#define CACHE_SIZE_TRIGGER (50 * 1048576)
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next) {
+ if ((ret = he_stats(ws->he_cache, &stats)) != 0)
+ EMSG_ERR(wtext, NULL,
+ ret, "he_stats: %s", he_strerror(ret));
+ if (stats.size > CACHE_SIZE_TRIGGER)
+ break;
+ }
+ if (!cleaner_stop && ws == NULL)
+ continue;
+
+ /* There was work to do, don't delay before checking again. */
+ delay = 0;
+
/*
* Get the oldest transaction ID not yet visible to a running
* transaction. Do this before doing anything else, avoiding
@@ -2900,11 +2839,14 @@ kvs_cleaner(void *arg)
oldest = wtext->transaction_oldest(wtext);
/*
+ * If any cache needs cleaning, clean them all, because we have
+ * to know the minimum transaction ID referenced by any cache.
+ *
* For each cache/primary pair, migrate whatever records we can,
* tracking the lowest transaction ID of any entry in any cache.
*/
txnmin = UINT64_MAX;
- for (ws = ks->ws_head; ws != NULL; ws = ws->next) {
+ for (ws = hs->ws_head; ws != NULL; ws = ws->next) {
cursor->ws = ws;
if ((ret = cache_cleaner(
wtext, wtcursor, oldest, &txntmp)) != 0)
@@ -2923,11 +2865,8 @@ kvs_cleaner(void *arg)
* problem here.
*/
cursor->ws = NULL;
- if ((ret = txn_cleaner(wtcursor, ks->kvstxn, txnmin)) != 0)
+ if ((ret = txn_cleaner(wtcursor, hs->he_txn, txnmin)) != 0)
goto err;
-
- if (cleaner_stop)
- break;
}
err: cursor_destroy(cursor);
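The cleaner's wake-up policy described above backs off by a second per idle pass, up to five seconds, and runs again without delay after a pass that found work. A bounded standalone simulation of that policy; cache_size() is a stand-in for the he_stats() size check:

#include <stdio.h>

#define	CACHE_SIZE_TRIGGER	(50 * 1048576)

/*
 * A stand-in for the per-pass size check: pretend the cache only crosses
 * the trigger on the 4th pass of this bounded simulation.
 */
static long
cache_size(int pass)
{
	return (pass == 4 ? CACHE_SIZE_TRIGGER + 1 : 0);
}

int
main(void)
{
	int delay, pass;

	for (delay = 0, pass = 1; pass <= 6; ++pass) {
		/* The real thread sleeps here when delay is nonzero. */
		printf("pass %d: delay %d second(s)\n", pass, delay);

		/* Back off up to a 5 second cap... */
		if (delay < 5)
			++delay;

		/* ...unless a cache crossed the trigger: clean, no delay. */
		if (cache_size(pass) > CACHE_SIZE_TRIGGER) {
			printf("pass %d: cleaning\n", pass);
			delay = 0;
		}
	}
	return (0);
}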
@@ -2935,152 +2874,219 @@ err: cursor_destroy(cursor);
}
/*
- * kvs_source_open --
- * Allocate and open a KVS source.
+ * helium_config_read --
+ * Parse the Helium configuration.
*/
static int
-kvs_source_open(DATA_SOURCE *ds, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v)
+helium_config_read(WT_EXTENSION_API *wtext, WT_CONFIG_ITEM *config,
+ char **devicep, HE_ENV *envp, int *env_setp, int *flagsp)
{
- struct kvs_config kvs_config;
- KVS_SOURCE *ks;
+ WT_CONFIG_ITEM k, v;
+ WT_CONFIG_SCAN *scan;
+ int ret = 0, tret;
+
+ *env_setp = 0;
+ *flagsp = 0;
+
+ /* Set up the scan of the configuration arguments list. */
+ if ((ret = wtext->config_scan_begin(
+ wtext, NULL, config->str, config->len, &scan)) != 0)
+ ERET(wtext, NULL, ret,
+ "WT_EXTENSION_API.config_scan_begin: %s",
+ wtext->strerror(ret));
+ while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0) {
+ if (string_match("helium_devices", k.str, k.len)) {
+ if ((*devicep = calloc(1, v.len + 1)) == NULL)
+ return (os_errno());
+ memcpy(*devicep, v.str, v.len);
+ continue;
+ }
+ if (string_match("helium_env_read_cache_size", k.str, k.len)) {
+ envp->read_cache_size = (uint64_t)v.val;
+ *env_setp = 1;
+ continue;
+ }
+ if (string_match("helium_env_write_cache_size", k.str, k.len)) {
+ envp->write_cache_size = (uint64_t)v.val;
+ *env_setp = 1;
+ continue;
+ }
+ if (string_match("helium_o_volume_truncate", k.str, k.len)) {
+ if (v.val != 0)
+ *flagsp |= HE_O_VOLUME_TRUNCATE;
+ continue;
+ }
+ EMSG_ERR(wtext, NULL, EINVAL,
+ "unknown configuration key value pair %.*s=%.*s",
+ (int)k.len, k.str, (int)v.len, v.str);
+ }
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ if (ret != 0)
+ EMSG_ERR(wtext, NULL, ret,
+ "WT_EXTENSION_API.config_scan_next: %s",
+ wtext->strerror(ret));
+
+err: if ((tret = wtext->config_scan_end(wtext, scan)) != 0)
+ EMSG(wtext, NULL, tret,
+ "WT_EXTENSION_API.config_scan_end: %s",
+ wtext->strerror(tret));
+
+ return (ret);
+}
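For reference, the configuration keys recognized above are helium_devices, helium_env_read_cache_size, helium_env_write_cache_size and helium_o_volume_truncate. A standalone sketch that dispatches on those key names; the WT_CONFIG_SCAN machinery and the exact WiredTiger configuration syntax are not reproduced here:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	static const char *keys[] = {
		"helium_devices",
		"helium_env_read_cache_size",
		"helium_env_write_cache_size",
		"helium_o_volume_truncate",
		"helium_bogus_key",
	};
	size_t i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i)
		if (strcmp(keys[i], "helium_devices") == 0)
			printf("%s: device naming the volume\n", keys[i]);
		else if (strncmp(keys[i], "helium_env_", 11) == 0)
			printf("%s: he_env cache size\n", keys[i]);
		else if (strcmp(keys[i], "helium_o_volume_truncate") == 0)
			printf("%s: HE_O_VOLUME_TRUNCATE flag\n", keys[i]);
		else
			printf("%s: unknown key, an error above\n", keys[i]);
	return (0);
}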
+
+/*
+ * helium_source_open --
+ * Allocate and open a Helium source.
+ */
+static int
+helium_source_open(DATA_SOURCE *ds, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v)
+{
+ struct he_env env;
+ HELIUM_SOURCE *hs;
WT_EXTENSION_API *wtext;
- int flags, ret = 0;
- char **device_list, **p;
+ int env_set, flags, ret = 0;
wtext = ds->wtext;
+ hs = NULL;
- ks = NULL;
- device_list = NULL;
+ VMSG(wtext, NULL, VERBOSE_L1, "volume %.*s=%.*s",
+ (int)k->len, k->str, (int)v->len, v->str);
- /* Check for a KVS source we've already opened. */
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
- if (STRING_MATCH(ks->name, k->str, k->len))
+ /*
+ * Check for a Helium source we've already opened: we don't check the
+ * value (which implies you can open the same underlying stores using
+	 * more than one name, but I don't know of any problems that causes);
+	 * we only check the key, that is, the top-level WiredTiger name.
+ */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ if (string_match(hs->name, k->str, k->len))
ERET(wtext, NULL,
- EINVAL, "%s: device already open", ks->name);
+ EINVAL, "%s: device already open", hs->name);
- /* Allocate and initialize a new underlying KVS source object. */
- if ((ks = calloc(1, sizeof(*ks))) == NULL ||
- (ks->name = calloc(1, k->len + 1)) == NULL) {
- free(ks);
+ /* Allocate and initialize a new underlying Helium source object. */
+ if ((hs = calloc(1, sizeof(*hs))) == NULL ||
+ (hs->name = calloc(1, k->len + 1)) == NULL) {
+ free(hs);
return (os_errno());
}
- memcpy(ks->name, k->str, k->len);
-
- ks->txn_notify.notify = txn_notify;
- ks->wtext = wtext;
-
- /*
- * Read the configuration. We require a list of devices underlying the
- * KVS source, parse the device list found in the configuration string
- * into an array of paths.
- */
- if ((ret =
- kvs_config_read(wtext, v, &device_list, &kvs_config, &flags)) != 0)
+ memcpy(hs->name, k->str, k->len);
+ hs->txn_notify.notify = txn_notify;
+ hs->wtext = wtext;
+
+ /* Read the configuration, require a device naming the Helium store. */
+ memset(&env, 0, sizeof(env));
+ if ((ret = helium_config_read(
+ wtext, v, &hs->device, &env, &env_set, &flags)) != 0)
goto err;
- if (device_list == NULL || device_list[0] == NULL)
+ if (hs->device == NULL)
EMSG_ERR(wtext, NULL,
- EINVAL, "%s: no devices specified", ks->name);
+ EINVAL, "%s: no Helium volumes specified", hs->name);
- /* Open the underlying KVS store (creating it if necessary). */
- ks->kvs_device =
- kvs_open(device_list, &kvs_config, flags | KVS_O_CREATE);
- if (ks->kvs_device == NULL)
- EMSG_ERR(wtext, NULL, WT_ERROR,
- "kvs_open: %s: %s", ks->name, kvs_strerror(os_errno()));
+ /*
+	 * Open the Helium volume, creating it if necessary. We have to open
+	 * an object at the same time; that's why we have object flags as well
+	 * as volume flags.
+ */
+ flags |= HE_O_CREATE |
+ HE_O_TRUNCATE | HE_O_VOLUME_CLEAN | HE_O_VOLUME_CREATE;
+ if ((hs->he_volume = he_open(
+ hs->device, WT_NAME_INIT, flags, env_set ? &env : NULL)) == NULL) {
+ ret = os_errno();
+ EMSG_ERR(wtext, NULL, ret,
+ "he_open: %s: %s: %s",
+ hs->name, WT_NAME_INIT, he_strerror(ret));
+ }
/* Insert the new entry at the head of the list. */
- ks->next = ds->kvs_head;
- ds->kvs_head = ks;
+ hs->next = ds->hs_head;
+ ds->hs_head = hs;
if (0) {
-err: if (ks != NULL)
- ESET(kvs_source_close(wtext, NULL, ks));
+err: if (hs != NULL)
+ ESET(helium_source_close(wtext, NULL, hs));
}
-
- if (device_list != NULL) {
- for (p = device_list; *p != NULL; ++p)
- free(*p);
- free(device_list);
- }
-
return (ret);
}
/*
- * kvs_source_open_txn --
+ * helium_source_open_txn --
* Open the database-wide transaction store.
*/
static int
-kvs_source_open_txn(DATA_SOURCE *ds)
+helium_source_open_txn(DATA_SOURCE *ds)
{
- KVS_SOURCE *ks, *kstxn;
+ HELIUM_SOURCE *hs, *hs_txn;
WT_EXTENSION_API *wtext;
- kvs_t kvstxn, t;
+ he_t he_txn, t;
int ret = 0;
wtext = ds->wtext;
/*
- * The global txn namespace is per connection, it spans multiple KVS
+ * The global txn namespace is per connection, it spans multiple Helium
* sources.
*
- * We've opened the KVS sources: check to see if any of them already
+ * We've opened the Helium sources: check to see if any of them already
* have a transaction store, and make sure we only find one.
*/
- kstxn = NULL;
- kvstxn = NULL;
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
- if ((t = kvs_open_namespace(
- ks->kvs_device, WT_NAME_TXN, 0)) != NULL) {
- if (kstxn != NULL) {
- (void)kvs_close(t);
- (void)kvs_close(kvstxn);
- ERET(wtext, NULL, WT_ERROR,
+ hs_txn = NULL;
+ he_txn = NULL;
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ if ((t = he_open(hs->device, WT_NAME_TXN, 0, NULL)) != NULL) {
+ if (hs_txn != NULL) {
+ (void)he_close(t);
+ (void)he_close(hs_txn);
+ ERET(wtext, NULL, WT_PANIC,
"found multiple transaction stores, "
"unable to proceed");
}
- kvstxn = t;
- kstxn = ks;
+ he_txn = t;
+ hs_txn = hs;
}
/*
* If we didn't find a transaction store, open a transaction store in
- * the first KVS source we loaded. (It could just as easily be the
- * last one we loaded, we're just picking one, but picking the first
+ * the first Helium source we loaded. (It could just as easily be
+ * the last one we loaded, we're just picking one, but picking the first
* seems slightly less likely to make people wonder.)
*/
- if ((ks = kstxn) == NULL) {
- for (ks = ds->kvs_head; ks->next != NULL; ks = ks->next)
+ if ((hs = hs_txn) == NULL) {
+ for (hs = ds->hs_head; hs->next != NULL; hs = hs->next)
;
- if ((kvstxn = kvs_open_namespace(
- ks->kvs_device, WT_NAME_TXN, KVS_O_CREATE)) == NULL)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_open_namespace: %s: %s",
- WT_NAME_TXN, kvs_strerror(os_errno()));
+ if ((he_txn = he_open(
+ hs->device, WT_NAME_TXN, HE_O_CREATE, NULL)) == NULL) {
+ ret = os_errno();
+ ERET(wtext, NULL, ret,
+ "he_open: %s: %s: %s",
+ hs->name, WT_NAME_TXN, he_strerror(ret));
+ }
/* Push the change. */
- if ((ret = kvs_commit(ks->kvs_device)) != 0)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_commit: %s", kvs_strerror(ret));
+ if ((ret = he_commit(he_txn)) != 0)
+ ERET(wtext, NULL, ret,
+ "he_commit: %s", he_strerror(ret));
}
+ VMSG(wtext, NULL, VERBOSE_L1, "%s" "transactional store on %s",
+ hs_txn == NULL ? "creating " : "", hs->name);
- /* Set the owner field, this KVS source has to be closed last. */
- ks->kvsowner = 1;
+ /* Set the owner field, this Helium source has to be closed last. */
+ hs->he_owner = 1;
- /* Add a reference to the open transaction store in each KVS source. */
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
- ks->kvstxn = kvstxn;
+ /* Add a reference to the transaction store in each Helium source. */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ hs->he_txn = he_txn;
return (0);
}
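The transaction-store selection above reduces to: scan every source for an existing store, fail if more than one turns up, and otherwise create one in the first source loaded. A standalone sketch of that shape over an array of flags; all names are hypothetical and no Helium calls are made:

#include <stdio.h>

#define	SOURCES	3

int
main(void)
{
	/* has_txn[i] says whether source i already holds a txn store. */
	int has_txn[SOURCES] = { 0, 1, 0 };
	int found, i, owner;

	found = 0;
	owner = -1;
	for (i = 0; i < SOURCES; ++i)
		if (has_txn[i]) {
			if (found++) {
				fprintf(stderr,
				    "multiple transaction stores\n");
				return (1);
			}
			owner = i;
		}

	if (owner == -1) {
		owner = 0;		/* Create in the first source loaded. */
		has_txn[owner] = 1;
	}

	printf("source %d owns the transaction store\n", owner);
	return (0);
}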
/*
- * kvs_source_recover_namespace --
- * Recover a single cache/primary pair in a KVS namespace.
+ * helium_source_recover_namespace --
+ * Recover a single cache/primary pair in a Helium namespace.
*/
static int
-kvs_source_recover_namespace(WT_DATA_SOURCE *wtds,
- KVS_SOURCE *ks, const char *name, WT_CONFIG_ARG *config)
+helium_source_recover_namespace(WT_DATA_SOURCE *wtds,
+ HELIUM_SOURCE *hs, const char *name, WT_CONFIG_ARG *config)
{
CURSOR *cursor;
DATA_SOURCE *ds;
@@ -3099,17 +3105,17 @@ kvs_source_recover_namespace(WT_DATA_SOURCE *wtds,
uri = NULL;
/*
- * The name we store on the Memrata device is a translation of the
+ * The name we store on the Helium device is a translation of the
* WiredTiger name: do the reverse process here so we can use the
* standard source-open function.
*/
- p = name + (sizeof(WT_NAME_PREFIX) - 1);
- len = strlen("memrata:") + strlen(ks->name) + strlen(p) + 10;
+ p = name + strlen(WT_NAME_PREFIX);
+ len = strlen("helium:") + strlen(hs->name) + strlen(p) + 10;
if ((uri = malloc(len)) == NULL) {
ret = os_errno();
goto err;
}
- (void)snprintf(uri, len, "memrata:%s/%s", ks->name, p);
+ (void)snprintf(uri, len, "helium:%s/%s", hs->name, p);
/*
* Open the cache/primary pair by going through the full open process,
@@ -3122,21 +3128,20 @@ kvs_source_recover_namespace(WT_DATA_SOURCE *wtds,
/* Fake up a cursor. */
if ((ret = fake_cursor(wtext, &wtcursor)) != 0)
- EMSG_ERR(wtext, NULL, ret,
- "kvs_source_recover_namespace: %s", strerror(ret));
+ EMSG_ERR(wtext, NULL, ret, "recovery: %s", strerror(ret));
cursor = (CURSOR *)wtcursor;
cursor->ws = ws;
/* Process, then clear, the cache. */
if ((ret = cache_cleaner(wtext, wtcursor, 0, NULL)) != 0)
goto err;
- if ((ret = kvs_truncate(ws->kvscache)) != 0)
- EMSG_ERR(wtext, NULL, WT_ERROR,
- "kvs_truncate: %s(cache): %s", ws->uri, kvs_strerror(ret));
+ if ((ret = he_truncate(ws->he_cache)) != 0)
+ EMSG_ERR(wtext, NULL, ret,
+ "he_truncate: %s(cache): %s", ws->uri, he_strerror(ret));
/* Close the underlying WiredTiger sources. */
-err: while ((ws = ks->ws_head) != NULL) {
- ks->ws_head = ws->next;
+err: while ((ws = hs->ws_head) != NULL) {
+ hs->ws_head = ws->next;
ESET(ws_source_close(wtext, NULL, ws));
}
@@ -3146,36 +3151,36 @@ err: while ((ws = ks->ws_head) != NULL) {
return (ret);
}
-struct kvs_namespace_cookie {
+struct helium_namespace_cookie {
char **list;
u_int list_cnt;
u_int list_max;
};
/*
- * kvs_namespace_list --
+ * helium_namespace_list --
* Get a list of the objects we're going to recover.
*/
static int
-kvs_namespace_list(void *cookie, const char *name)
+helium_namespace_list(void *cookie, const char *name)
{
- struct kvs_namespace_cookie *names;
- const char *p;
+ struct helium_namespace_cookie *names;
void *allocp;
names = cookie;
- /* Ignore any files without a WiredTiger prefix. */
- if (strncmp(name, WT_NAME_PREFIX, sizeof(WT_NAME_PREFIX) - 1) != 0)
+ /*
+ * Ignore any files without a WiredTiger prefix.
+ * Ignore the metadata and cache files.
+ */
+ if (!prefix_match(name, WT_NAME_PREFIX))
+ return (0);
+ if (strcmp(name, WT_NAME_INIT) == 0)
return (0);
-
- /* Ignore the transaction store. */
if (strcmp(name, WT_NAME_TXN) == 0)
return (0);
-
- /* Ignore the "cache" files. */
- p = name + (sizeof(WT_NAME_PREFIX) - 1);
- if ((p = strchr(p, '.')) != NULL && strcmp(p, WT_NAME_CACHE) == 0)
+ if (string_match(
+ strrchr(name, '.'), WT_NAME_CACHE, strlen(WT_NAME_CACHE)))
return (0);
if (names->list_cnt + 1 >= names->list_max) {
@@ -3193,13 +3198,14 @@ kvs_namespace_list(void *cookie, const char *name)
}
/*
- * kvs_source_recover --
- * Recover the KVS source.
+ * helium_source_recover --
+ * Recover the HELIUM_SOURCE.
*/
static int
-kvs_source_recover(WT_DATA_SOURCE *wtds, KVS_SOURCE *ks, WT_CONFIG_ARG *config)
+helium_source_recover(
+ WT_DATA_SOURCE *wtds, HELIUM_SOURCE *hs, WT_CONFIG_ARG *config)
{
- struct kvs_namespace_cookie names;
+ struct helium_namespace_cookie names;
DATA_SOURCE *ds;
WT_EXTENSION_API *wtext;
u_int i;
@@ -3207,25 +3213,27 @@ kvs_source_recover(WT_DATA_SOURCE *wtds, KVS_SOURCE *ks, WT_CONFIG_ARG *config)
ds = (DATA_SOURCE *)wtds;
wtext = ds->wtext;
-
memset(&names, 0, sizeof(names));
- /* Get a list of the cache/primary object pairs in the KVS source. */
- if ((ret = kvs_namespaces(
- ks->kvs_device, kvs_namespace_list, &names)) != 0)
- ERET(wtext, NULL, WT_ERROR,
- "kvs_namespaces: %s: %s", ks->name, kvs_strerror(ret));
+ VMSG(wtext, NULL, VERBOSE_L1, "recover %s", hs->name);
+
+ /* Get a list of the cache/primary object pairs in the Helium source. */
+ if ((ret = he_enumerate(
+ hs->device, helium_namespace_list, &names)) != 0)
+ ERET(wtext, NULL, ret,
+ "he_enumerate: %s: %s", hs->name, he_strerror(ret));
/* Recover the objects. */
for (i = 0; i < names.list_cnt; ++i)
- if ((ret = kvs_source_recover_namespace(
- wtds, ks, names.list[i], config)) != 0)
+ if ((ret = helium_source_recover_namespace(
+ wtds, hs, names.list[i], config)) != 0)
goto err;
/* Clear the transaction store. */
- if ((ret = kvs_truncate(ks->kvstxn)) != 0)
- EMSG_ERR(wtext, NULL, WT_ERROR,
- "kvs_truncate: %s: %s", WT_NAME_TXN, kvs_strerror(ret));
+ if ((ret = he_truncate(hs->he_txn)) != 0)
+ EMSG_ERR(wtext, NULL, ret,
+ "he_truncate: %s: %s: %s",
+ hs->name, WT_NAME_TXN, he_strerror(ret));
err: for (i = 0; i < names.list_cnt; ++i)
free(names.list[i]);
@@ -3235,14 +3243,14 @@ err: for (i = 0; i < names.list_cnt; ++i)
}
/*
- * kvs_terminate --
+ * helium_terminate --
* Unload the data-source.
*/
static int
-kvs_terminate(WT_DATA_SOURCE *wtds, WT_SESSION *session)
+helium_terminate(WT_DATA_SOURCE *wtds, WT_SESSION *session)
{
DATA_SOURCE *ds;
- KVS_SOURCE *ks, *last;
+ HELIUM_SOURCE *hs, *last;
WT_EXTENSION_API *wtext;
int ret = 0;
@@ -3254,20 +3262,20 @@ kvs_terminate(WT_DATA_SOURCE *wtds, WT_SESSION *session)
ret = writelock(wtext, session, &ds->global_lock);
/*
- * Close the KVS sources, close the KVS source that "owns" the
+ * Close the Helium sources, close the Helium source that "owns" the
* database transaction store last.
*/
last = NULL;
- while ((ks = ds->kvs_head) != NULL) {
- ds->kvs_head = ks->next;
- if (ks->kvsowner) {
- last = ks;
+ while ((hs = ds->hs_head) != NULL) {
+ ds->hs_head = hs->next;
+ if (hs->he_owner) {
+ last = hs;
continue;
}
- ESET(kvs_source_close(wtext, session, ks));
+ ESET(helium_source_close(wtext, session, hs));
}
if (last != NULL)
- ESET(kvs_source_close(wtext, session, last));
+ ESET(helium_source_close(wtext, session, last));
/* Unlock and destroy the system. */
if (ds->lockinit) {
@@ -3282,7 +3290,7 @@ kvs_terminate(WT_DATA_SOURCE *wtds, WT_SESSION *session)
/*
* wiredtiger_extension_init --
- * Initialize the KVS connector code.
+ * Initialize the Helium connector code.
*/
int
wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
@@ -3291,44 +3299,47 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
* List of the WT_DATA_SOURCE methods -- it's static so it breaks at
* compile-time should the structure change underneath us.
*/
- static WT_DATA_SOURCE wtds = {
- kvs_session_create, /* session.create */
+ static const WT_DATA_SOURCE wtds = {
+ helium_session_create, /* session.create */
NULL, /* No session.compaction */
- kvs_session_drop, /* session.drop */
- kvs_session_open_cursor, /* session.open_cursor */
- kvs_session_rename, /* session.rename */
+ helium_session_drop, /* session.drop */
+ helium_session_open_cursor, /* session.open_cursor */
+ helium_session_rename, /* session.rename */
NULL, /* No session.salvage */
- kvs_session_truncate, /* session.truncate */
+ helium_session_truncate, /* session.truncate */
NULL, /* No session.range_truncate */
- kvs_session_verify, /* session.verify */
- kvs_session_checkpoint, /* session.checkpoint */
- kvs_terminate /* termination */
+ helium_session_verify, /* session.verify */
+ helium_session_checkpoint, /* session.checkpoint */
+ helium_terminate /* termination */
};
static const char *session_create_opts[] = {
- "kvs_open_o_truncate=0",
- "kvs_open_o_debug=0",
+ "helium_o_compress=0", /* HE_I_COMPRESS */
+ "helium_o_truncate=0", /* HE_O_TRUNCATE */
NULL
};
DATA_SOURCE *ds;
- KVS_SOURCE *ks;
+ HELIUM_SOURCE *hs;
WT_CONFIG_ITEM k, v;
WT_CONFIG_SCAN *scan;
WT_EXTENSION_API *wtext;
- int ret = 0;
+ int vmajor, vminor, ret = 0;
const char **p;
- (void)config; /* Unused parameters */
-
ds = NULL;
- /* Acquire the extension API */
+
wtext = connection->get_extension_api(connection);
/* Check the library version */
-#if KVS_VERSION_MAJOR != 4 || KVS_VERSION_MINOR != 13
+#if HE_VERSION_MAJOR != 2 || HE_VERSION_MINOR != 2
ERET(wtext, NULL, EINVAL,
- "unsupported KVS library version %d.%d, expected version 4.13",
- KVS_VERSION_MAJOR, KVS_VERSION_MINOR);
+ "unsupported Levyx/Helium header file %d.%d, expected version 2.2",
+ HE_VERSION_MAJOR, HE_VERSION_MINOR);
#endif
+ he_version(&vmajor, &vminor);
+ if (vmajor != 2 || vminor != 2)
+ ERET(wtext, NULL, EINVAL,
+ "unsupported Levyx/Helium library version %d.%d, expected "
+ "version 2.2", vmajor, vminor);
/* Allocate and initialize the local data-source structure. */
if ((ds = calloc(1, sizeof(DATA_SOURCE))) == NULL)
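The version test above checks the header at compile time and the loaded library at run time. A standalone sketch of the same pattern with hypothetical MYLIB_* names, not a real API:

#include <stdio.h>

#define	MYLIB_VERSION_MAJOR	2
#define	MYLIB_VERSION_MINOR	2

/* Hypothetical run-time query of the library actually loaded. */
static void
mylib_version(int *majorp, int *minorp)
{
	*majorp = 2;
	*minorp = 2;
}

int
main(void)
{
	int vmajor, vminor;

	/* Compile-time check: the header we built against. */
#if MYLIB_VERSION_MAJOR != 2 || MYLIB_VERSION_MINOR != 2
#error "unsupported header version"
#endif

	/* Run-time check: the library actually loaded. */
	mylib_version(&vmajor, &vminor);
	if (vmajor != 2 || vminor != 2) {
		fprintf(stderr,
		    "unsupported library version %d.%d\n", vmajor, vminor);
		return (1);
	}
	return (0);
}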
@@ -3345,15 +3356,20 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
"WT_EXTENSION_API.config_get: config: %s",
wtext->strerror(ret));
- /* Step through the list of KVS sources, opening each one. */
+ /* Step through the list of Helium sources, opening each one. */
if ((ret =
wtext->config_scan_begin(wtext, NULL, v.str, v.len, &scan)) != 0)
EMSG_ERR(wtext, NULL, ret,
"WT_EXTENSION_API.config_scan_begin: config: %s",
wtext->strerror(ret));
- while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0)
- if ((ret = kvs_source_open(ds, &k, &v)) != 0)
+ while ((ret = wtext->config_scan_next(wtext, scan, &k, &v)) == 0) {
+ if (string_match("helium_verbose", k.str, k.len)) {
+ verbose = v.val == 0 ? 0 : 1;
+ continue;
+ }
+ if ((ret = helium_source_open(ds, &k, &v)) != 0)
goto err;
+ }
if (ret != WT_NOTFOUND)
EMSG_ERR(wtext, NULL, ret,
"WT_EXTENSION_API.config_scan_next: config: %s",
@@ -3364,26 +3380,26 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
wtext->strerror(ret));
/* Find and open the database transaction store. */
- if ((ret = kvs_source_open_txn(ds)) != 0)
+ if ((ret = helium_source_open_txn(ds)) != 0)
return (ret);
- /* Recover each KVS source. */
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
- if ((ret = kvs_source_recover(&ds->wtds, ks, config)) != 0)
+ /* Recover each Helium source. */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
+ if ((ret = helium_source_recover(&ds->wtds, hs, config)) != 0)
goto err;
- /* Start each KVS source cleaner thread. */
- for (ks = ds->kvs_head; ks != NULL; ks = ks->next)
+ /* Start each Helium source cleaner thread. */
+ for (hs = ds->hs_head; hs != NULL; hs = hs->next)
if ((ret = pthread_create(
- &ks->cleaner_id, NULL, kvs_cleaner, ks)) != 0)
+ &hs->cleaner_id, NULL, cache_cleaner_worker, hs)) != 0)
EMSG_ERR(wtext, NULL, ret,
"%s: pthread_create: cleaner thread: %s",
- ks->name, strerror(ret));
+ hs->name, strerror(ret));
- /* Add KVS-specific configuration options. */
+ /* Add Helium-specific WT_SESSION.create configuration options. */
for (p = session_create_opts; *p != NULL; ++p)
if ((ret = connection->configure_method(connection,
- "session.create", "memrata:", *p, "boolean", NULL)) != 0)
+ "session.create", "helium:", *p, "boolean", NULL)) != 0)
EMSG_ERR(wtext, NULL, ret,
"WT_CONNECTION.configure_method: session.create: "
"%s: %s",
@@ -3391,19 +3407,19 @@ wiredtiger_extension_init(WT_CONNECTION *connection, WT_CONFIG_ARG *config)
/* Add the data source */
if ((ret = connection->add_data_source(
- connection, "memrata:", (WT_DATA_SOURCE *)ds, NULL)) != 0)
+ connection, "helium:", (WT_DATA_SOURCE *)ds, NULL)) != 0)
EMSG_ERR(wtext, NULL, ret,
"WT_CONNECTION.add_data_source: %s", wtext->strerror(ret));
return (0);
err: if (ds != NULL)
- ESET(kvs_terminate((WT_DATA_SOURCE *)ds, NULL));
+ ESET(helium_terminate((WT_DATA_SOURCE *)ds, NULL));
return (ret);
}
/*
* wiredtiger_extension_terminate --
- * Shutdown the KVS connector code.
+ * Shutdown the Helium connector code.
*/
int
wiredtiger_extension_terminate(WT_CONNECTION *connection)
diff --git a/ext/test/memrata/Makefile.am b/ext/test/memrata/Makefile.am
deleted file mode 100644
index 1962680d3fe..00000000000
--- a/ext/test/memrata/Makefile.am
+++ /dev/null
@@ -1,12 +0,0 @@
-AM_CPPFLAGS = -I$(top_builddir) \
- -I$(top_srcdir)/src/include -I$(MEMRATA_PATH)
-
-noinst_LTLIBRARIES = libwiredtiger_memrata.la
-libwiredtiger_memrata_la_SOURCES = memrata.c
-libwiredtiger_memrata_la_LIBADD = \
- -L$(MEMRATA_PATH) -lkvs -L$(BERKELEY_DB_PATH)/lib -ldb
-
-# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
-# as installation, it will only build static libraries. As far as I can tell,
-# the "approved" libtool way to turn them back on is by adding -rpath.
-libwiredtiger_memrata_la_LDFLAGS = -avoid-version -module -rpath /nowhere
diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i
index 670486a541d..31dc159410b 100644
--- a/lang/python/wiredtiger.i
+++ b/lang/python/wiredtiger.i
@@ -57,7 +57,7 @@ from packing import pack, unpack
}
%typemap(in, numinputs=0) WT_EVENT_HANDLER * %{
- $1 = &pyApiEventHandler;
+ $1 = &pyApiEventHandler;
%}
/* Set the return value to the returned connection, session, or cursor */
@@ -528,8 +528,8 @@ typedef int int_void;
%extend __wt_session {
int log_printf(const char *msg) {
- return self->log_printf(self, "%s", msg);
- }
+ return self->log_printf(self, "%s", msg);
+ }
int _freecb() {
return (sessionFreeHandler(self));
@@ -601,7 +601,7 @@ writeToPythonStream(const char *streamname, const char *message)
strcpy(&msg[msglen], "\n");
/* Acquire python Global Interpreter Lock. Otherwise can segfault. */
- SWIG_PYTHON_THREAD_BEGIN_BLOCK;
+ SWIG_PYTHON_THREAD_BEGIN_BLOCK;
ret = 1;
if ((sys = PyImport_ImportModule("sys")) == NULL)
@@ -617,7 +617,7 @@ writeToPythonStream(const char *streamname, const char *message)
ret = 0;
err: /* Release python Global Interpreter Lock */
- SWIG_PYTHON_THREAD_END_BLOCK;
+ SWIG_PYTHON_THREAD_END_BLOCK;
if (arglist)
Py_XDECREF(arglist);
@@ -656,11 +656,11 @@ pythonClose(PY_CALLBACK *pcb)
{
int ret;
- /*
- * Ensure the global interpreter lock is held - so that Python
- * doesn't shut down threads while we use them.
- */
- SWIG_PYTHON_THREAD_BEGIN_BLOCK;
+ /*
+ * Ensure the global interpreter lock is held - so that Python
+ * doesn't shut down threads while we use them.
+ */
+ SWIG_PYTHON_THREAD_BEGIN_BLOCK;
ret = 0;
if (PyObject_SetAttrString(pcb->pyobj, "this", Py_None) == -1) {
@@ -669,7 +669,7 @@ pythonClose(PY_CALLBACK *pcb)
}
Py_XDECREF(pcb->pyobj);
- SWIG_PYTHON_THREAD_END_BLOCK;
+ SWIG_PYTHON_THREAD_END_BLOCK;
return (ret);
}
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index f5b0180b2a6..d57162c06a9 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -246,7 +246,7 @@ __evict_worker(WT_SESSION_IMPL *session)
"Eviction pass with: Max: %" PRIu64
" In use: %" PRIu64 " Dirty: %" PRIu64 " Internal: %s",
bytes_max, bytes_inuse, dirty_inuse,
- F_ISSET(cache, WT_EVICT_INTERNAL) ? "yes" : "no");
+ LF_ISSET(WT_EVICT_PASS_INTERNAL) ? "yes" : "no");
/*
* When the cache is full, track whether pages are being
@@ -858,7 +858,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
btree->evict_page->ref->state == WT_REF_EVICT_WALK);
walk_flags = WT_TREE_EVICT;
- if (F_ISSET(cache, WT_EVICT_INTERNAL))
+ if (LF_ISSET(WT_EVICT_PASS_INTERNAL))
walk_flags |= WT_TREE_SKIP_LEAF;
/*
* Get some more eviction candidate pages.
@@ -887,7 +887,13 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
if (WT_PAGE_IS_ROOT(page))
continue;
- /* Look for a split-merge (grand)parent page to merge. */
+ /*
+ * Look for a split-merge (grand)parent page to merge.
+ *
+ * Only look for a parent at exactly the right height above: if
+ * the stack is deep enough, we'll find it eventually, and we
+ * don't want to do too much work on every level.
+ */
levels = 0;
if (__wt_btree_mergeable(page))
for (levels = 1;
@@ -900,85 +906,96 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, uint32_t flags)
continue;
/*
- * Only look for a parent at exactly the right height above: if
- * the stack is deep enough, we'll find it eventually, and we
- * don't want to do too much work on every level.
- *
+ * Use the EVICT_LRU flag to avoid putting pages onto the list
+ * multiple times.
+ */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ continue;
+
+ /*
* !!!
- * Don't restrict ourselves to only the top-most page (that is,
- * don't require that page->parent is not mergeable). If there
- * is a big, busy enough split-merge tree, the top-level merge
- * will only happen if we can lock the whole subtree
- * exclusively. Consider smaller merges in case locking the
- * whole tree fails.
+ * In normal operation, don't restrict ourselves to only the
+ * top-most page (that is, don't require that page->parent is
+ * not mergeable). If there is a big, busy enough split-merge
+ * tree, the top-level merge will only happen if we can lock
+ * the whole subtree exclusively. Consider smaller merges in
+ * case locking the whole tree fails.
*/
- if (levels != 0 && levels != WT_MERGE_STACK_MIN)
+ if (levels != 0) {
+ if (levels < WT_MERGE_STACK_MIN)
+ continue;
+
+ /*
+ * Concentrate near the top of a stack -- with forced
+ * eviction, stacks of split-merge pages can get very
+ * deep, and merging near the bottom isn't helpful.
+ */
+ if (LF_ISSET(WT_EVICT_PASS_INTERNAL) &&
+ __wt_btree_mergeable(page->parent) &&
+ __wt_btree_mergeable(page->parent->parent))
+ continue;
+
+ /* The remaining checks don't apply to merges. */
+ goto add;
+ } else if (LF_ISSET(WT_EVICT_PASS_INTERNAL))
continue;
/*
- * If this page has never been considered for eviction, set its
- * read generation to a little bit in the future and move on,
- * give readers a chance to start updating the read generation.
+ * If this page has never been considered for eviction,
+ * set its read generation to a little bit in the
+ * future and move on, give readers a chance to start
+ * updating the read generation.
*/
if (page->read_gen == WT_READ_GEN_NOTSET) {
- page->read_gen = __wt_cache_read_gen_set(session);
+ page->read_gen =
+ __wt_cache_read_gen_set(session);
continue;
}
/*
- * Use the EVICT_LRU flag to avoid putting pages onto the list
- * multiple times.
+ * If the file is being checkpointed, there's a period
+ * of time where we can't discard any page with a
+ * modification structure because it might race with
+ * the checkpointing thread.
+ *
+ * During this phase, there is little point trying to
+ * evict dirty pages: we might be lucky and find an
+ * internal page that has not yet been checkpointed,
+ * but much more likely is that we will waste effort
+ * considering dirty leaf pages that cannot be evicted
+ * because they have modifications more recent than the
+ * checkpoint.
*/
- if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
+ modified = __wt_page_is_modified(page);
+ if (modified && btree->checkpointing)
continue;
- /* The following checks apply to eviction but not merges. */
- if (levels == 0) {
- /*
- * If the file is being checkpointed, there's a period
- * of time where we can't discard any page with a
- * modification structure because it might race with
- * the checkpointing thread.
- *
- * During this phase, there is little point trying to
- * evict dirty pages: we might be lucky and find an
- * internal page that has not yet been checkpointed,
- * but much more likely is that we will waste effort
- * considering dirty leaf pages that cannot be evicted
- * because they have modifications more recent than the
- * checkpoint.
- */
- modified = __wt_page_is_modified(page);
- if (modified && btree->checkpointing)
- continue;
-
- /* Optionally ignore clean pages. */
- if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
- continue;
+ /* Optionally ignore clean pages. */
+ if (!modified && LF_ISSET(WT_EVICT_PASS_DIRTY))
+ continue;
- /*
- * If the oldest transaction hasn't changed since the
- * last time this page was written, it's unlikely that
- * we can make progress. Similarly, if the most recent
- * update on the page is not yet globally visible,
- * eviction will fail. These heuristics attempt to
- * avoid repeated attempts to evict the same page.
- *
- * That said, if eviction is stuck, or the file is
- * being checkpointed, try anyway: maybe a transaction
- * that were running last time we wrote the page has
- * since rolled back, or we can help get the checkpoint
- * completed sooner.
- */
- if (modified && !F_ISSET(cache, WT_EVICT_STUCK) &&
- (page->modify->disk_snap_min ==
- S2C(session)->txn_global.oldest_id ||
- !__wt_txn_visible_all(session,
- page->modify->update_txn)))
- continue;
- }
+ /*
+ * If the oldest transaction hasn't changed since the
+ * last time this page was written, it's unlikely that
+ * we can make progress. Similarly, if the most recent
+ * update on the page is not yet globally visible,
+ * eviction will fail. These heuristics attempt to
+ * avoid repeated attempts to evict the same page.
+ *
+ * That said, if eviction is stuck, or the file is
+ * being checkpointed, try anyway: maybe a transaction
+		 * that was running last time we wrote the page has
+ * since rolled back, or we can help get the checkpoint
+ * completed sooner.
+ */
+ if (modified && !F_ISSET(cache, WT_EVICT_STUCK) &&
+ (page->modify->disk_snap_min ==
+ S2C(session)->txn_global.oldest_id ||
+ !__wt_txn_visible_all(session,
+ page->modify->update_txn)))
+ continue;
- WT_ASSERT(session, evict->page == NULL);
+add: WT_ASSERT(session, evict->page == NULL);
__evict_init_candidate(session, evict, page);
++evict;
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 5c1f45a8030..f6cc4cc6fb3 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -11,7 +11,7 @@ static int __btree_conf(WT_SESSION_IMPL *, WT_CKPT *ckpt);
static int __btree_get_last_recno(WT_SESSION_IMPL *);
static int __btree_page_sizes(WT_SESSION_IMPL *);
static int __btree_preload(WT_SESSION_IMPL *);
-static int __btree_tree_open_empty(WT_SESSION_IMPL *, int);
+static int __btree_tree_open_empty(WT_SESSION_IMPL *, int, int);
static int pse1(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t);
static int pse2(WT_SESSION_IMPL *, const char *, uint32_t, uint32_t, int);
@@ -102,7 +102,8 @@ __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[])
ckpt.raw.data, ckpt.raw.size,
root_addr, &root_addr_size, readonly));
if (creation || root_addr_size == 0)
- WT_ERR(__btree_tree_open_empty(session, creation));
+ WT_ERR(__btree_tree_open_empty(
+ session, creation, readonly));
else {
WT_ERR(__wt_btree_tree_open(
session, root_addr, root_addr_size));
@@ -355,7 +356,7 @@ err: __wt_buf_free(session, &dsk);
* Create an empty in-memory tree.
*/
static int
-__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
+__btree_tree_open_empty(WT_SESSION_IMPL *session, int creation, int readonly)
{
WT_BTREE *btree;
WT_DECL_RET;
@@ -423,23 +424,31 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, int creation)
* the root page dirty to force a write, and without reconciling the
* leaf page we won't realize there's no records to write, we'll write
* a root page, which isn't correct for an empty tree.
- * Earlier versions of this code kept the leaf page clean, but with
- * the "empty" flag set in the leaf page's modification structure; in
- * that case, checkpoints works (forced reconciliation of a root with
- * a single "empty" page wouldn't write any blocks). That version had
+ *
+ * Earlier versions of this code kept the leaf page clean, but with the
+ * "empty" flag set in the leaf page's modification structure; in that
+	 * case, checkpoints work (forced reconciliation of a root with a
+ * single "empty" page wouldn't write any blocks). That version had
* memory leaks because the eviction code didn't correctly handle pages
* that were "clean" (and so never reconciled), yet "modified" with an
* "empty" flag. The goal of this code is to mimic a real tree that
* simply has no records, for whatever reason, and trust reconciliation
* to figure out it's empty and not write any blocks.
- * We do not set the tree's modified flag because the checkpoint code
- * skips unmodified files in closing checkpoints (checkpoints that don't
- * require a write unless the file is actually dirty). There's no need
- * to reconcile this file unless the application does a real checkpoint
- * or it's actually modified.
+ *
+ * We do not set the tree's modified flag because the checkpoint code
+ * skips unmodified files in closing checkpoints (checkpoints that
+ * don't require a write unless the file is actually dirty). There's
+ * no need to reconcile this file unless the application does a real
+ * checkpoint or it's actually modified.
+ *
+ * Only do this for a live tree, not for checkpoints. If we open an
+ * empty checkpoint, the leaf page cannot be dirty or eviction may try
+ * to write it, which will fail because checkpoints are read-only.
*/
- WT_ERR(__wt_page_modify_init(session, leaf));
- __wt_page_only_modify_set(session, leaf);
+ if (!readonly) {
+ WT_ERR(__wt_page_modify_init(session, leaf));
+ __wt_page_only_modify_set(session, leaf);
+ }
btree->root_page = root;
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index f7d146c42e2..0713989af58 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -441,32 +441,32 @@ ckpt: WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
*/
if (__wt_page_is_modified(page) &&
!F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
- ret = __wt_rec_write(session, page,
- NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);
-
- /*
- * Update the page's modification reference, reconciliation
- * might have changed it.
- */
- mod = page->modify;
-
/*
- * If reconciliation failed due to active modifications and
- * the page is a lot larger than the maximum allowed, it is
- * likely that we are having trouble reconciling it due to
- * contention, attempt to split the page in memory.
+ * If the page is larger than the maximum allowed, attempt to
+ * split the page in memory before evicting it. The in-memory
+ * split checks for left and right splits, and prevents the
+ * tree deepening unnecessarily.
*
* Note, we won't be here if recursively descending a tree of
* pages: dirty row-store leaf pages can't be merged into their
* parents, which means if top wasn't true in this test, we'd
* have returned busy before attempting reconciliation.
*/
- if (ret == EBUSY &&
- page->type == WT_PAGE_ROW_LEAF &&
+ if (page->type == WT_PAGE_ROW_LEAF &&
+ !F_ISSET_ATOMIC(page, WT_PAGE_WAS_SPLIT) &&
__wt_eviction_force_check(session, page)) {
*inmem_split = 1;
return (0);
}
+
+ ret = __wt_rec_write(session, page,
+ NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);
+
+ /*
+ * Update the page's modification reference, reconciliation
+ * might have changed it.
+ */
+ mod = page->modify;
if (ret == EBUSY) {
/* Give up if there are unwritten changes */
WT_VERBOSE_RET(session, evict,
diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c
index 7599fa8cb84..cf8ef88c5ac 100644
--- a/src/btree/rec_merge.c
+++ b/src/btree/rec_merge.c
@@ -307,14 +307,9 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
return (EBUSY);
- /*
- * Don't allow split merges to generate arbitrarily large pages.
- * Ideally we would choose a size based on the internal_page_max
- * setting for the btree, but we don't have the correct btree handle
- * available.
- */
- if (visit_state.refcnt > WT_MERGE_MAX_REFS)
- return (EBUSY);
+ /* Pages cannot grow larger than 2**32, but that should never happen. */
+ if (visit_state.refcnt > UINT32_MAX)
+ return (ENOMEM);
/*
* Now we either collapse the internal pages into one split-merge page,
@@ -332,17 +327,19 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
* In the normal case where there are live children spread
* through the subtree, create two child pages.
*
- * Handle the case where the only live child is first / last
- * specially: put the live child into the top-level page.
+ * Handle the case where the live children are all near the
+ * beginning / end specially: put the last live child into the
+ * top-level page, to avoid getting much deeper during
+ * append-only workloads.
*
* Set SPLIT_MERGE on the internal pages if there are any live
* children: they can't be evicted, so there is no point
* permanently deepening the tree.
*/
- if (visit_state.first_live == visit_state.last_live &&
- (visit_state.first_live == 0 ||
- visit_state.first_live == refcnt - 1))
- split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
+ if (visit_state.last_live <= refcnt / 10)
+ split = 1;
+ else if (visit_state.first_live >= (9 * refcnt) / 10)
+ split = refcnt - 1;
else
split = (refcnt + 1) / 2;
@@ -370,7 +367,7 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
else {
WT_ERR(__wt_btree_new_modified_page(
session, page_type, split,
- visit_state.first_live < split, &lchild));
+ split < WT_MERGE_FULL_PAGE, &lchild));
visit_state.first = lchild;
}
@@ -380,8 +377,8 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
visit_state.second_ref = &newtop->u.intl.t[1];
} else {
WT_ERR(__wt_btree_new_modified_page(
- session, page_type,
- refcnt - split, visit_state.last_live >= split,
+ session, page_type, refcnt - split,
+ refcnt - split < WT_MERGE_FULL_PAGE,
&rchild));
visit_state.second = rchild;
visit_state.second_ref =
@@ -389,17 +386,15 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
}
} else {
/*
- * Create a new split-merge page for small merges, or if the
- * page above is a split merge page. When we do a big enough
- * merge, we create a real page at the top and don't consider
- * it as a merge candidate again. Over time with an insert
- * workload the tree will grow deeper, but that's inevitable,
- * and this keeps individual merges small.
+ * Create a new split-merge page for small merges. When we do
+ * a big enough merge, we create a real page at the top and
+ * don't consider it as a merge candidate again. Over time
+ * with an insert workload the tree will grow deeper, but
+ * that's inevitable, and this keeps individual merges small.
*/
WT_ERR(__wt_btree_new_modified_page(
session, page_type, refcnt,
- refcnt < WT_MERGE_FULL_PAGE ||
- __wt_btree_mergeable(top->parent),
+ refcnt < WT_MERGE_FULL_PAGE,
&newtop));
visit_state.first = newtop;
diff --git a/src/btree/rec_track.c b/src/btree/rec_track.c
index 1ea5c1093d5..99e9aebc14f 100644
--- a/src/btree/rec_track.c
+++ b/src/btree/rec_track.c
@@ -382,13 +382,13 @@ __ovfl_reuse_dump(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* __ovfl_reuse_skip_search --
- * Return the first matching value in the overflow reuse list.
+ * Return the first, not in-use, matching value in the overflow reuse list.
*/
static WT_OVFL_REUSE *
__ovfl_reuse_skip_search(
WT_OVFL_REUSE **head, const void *value, size_t value_size)
{
- WT_OVFL_REUSE **e;
+ WT_OVFL_REUSE **e, *next;
size_t len;
int cmp, i;
@@ -404,13 +404,29 @@ __ovfl_reuse_skip_search(
}
/*
- * Return any exact matches: we don't care in what search level
- * we found a match.
+ * Values are not unique, and it's possible to have long lists
+ * of identical overflow items. (We've seen it in benchmarks.)
+ * Move through a list of identical items at the current level
+ * as long as the next one is in-use, otherwise, drop down a
+ * level. When at the bottom level, return items if reusable,
+ * else NULL.
*/
len = WT_MIN((*e)->value_size, value_size);
cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
- if (cmp == 0 && (*e)->value_size == value_size)
- return (*e);
+ if (cmp == 0 && (*e)->value_size == value_size) {
+ if (i == 0)
+ return (F_ISSET(*e,
+ WT_OVFL_REUSE_INUSE) ? NULL : *e);
+ if ((next = (*e)->next[i]) == NULL ||
+ !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
+ next->value_size != len || memcmp(
+ WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
+ --i; /* Drop down a level */
+ --e;
+ } else /* Keep going at this level */
+ e = &(*e)->next[i];
+ continue;
+ }
/*
* If the skiplist value is larger than the search value, or
@@ -612,28 +628,19 @@ __wt_ovfl_reuse_search(WT_SESSION_IMPL *session, WT_PAGE *page,
head = page->modify->ovfl_track->ovfl_reuse;
/*
- * The search function returns the first matching record in the list,
- * which may be the first of many, overflow records may be identical.
- * Find one without the in-use flag set and put it back into service.
+ * The search function returns the first matching record in the list
+ * which does not have the in-use flag set, or NULL.
*/
if ((reuse = __ovfl_reuse_skip_search(head, value, value_size)) == NULL)
return (0);
- do {
- if (!F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) {
- *addrp = WT_OVFL_REUSE_ADDR(reuse);
- *addr_sizep = reuse->addr_size;
- F_SET(reuse, WT_OVFL_REUSE_INUSE);
- if (WT_VERBOSE_ISSET(session, overflow))
- WT_RET(__ovfl_reuse_verbose(
- session, page, reuse, "reclaim"));
- return (1);
- }
- } while ((reuse = reuse->next[0]) != NULL &&
- reuse->value_size == value_size &&
- memcmp(WT_OVFL_REUSE_VALUE(reuse), value, value_size) == 0);
+ *addrp = WT_OVFL_REUSE_ADDR(reuse);
+ *addr_sizep = reuse->addr_size;
+ F_SET(reuse, WT_OVFL_REUSE_INUSE);
- return (0);
+ if (WT_VERBOSE_ISSET(session, overflow))
+ WT_RET(__ovfl_reuse_verbose(session, page, reuse, "reclaim"));
+ return (1);
}
/*
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index 66ce4c089e8..81a4ec7a025 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1616,10 +1616,10 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
* We can't compress the first 64B of the block (it must be
* written without compression), and a possible split point
* may appear in that 64B; keep it simple, ignore the first
- * 1KB of data, anybody splitting a smaller than 1KB piece
- * (as calculated before compression), is doing us wrong.
+	 * allocation size of data; anybody splitting smaller than
+	 * that (as calculated before compression) is doing it wrong.
*/
- if ((len = WT_PTRDIFF(cell, dsk)) > 1024)
+ if ((len = WT_PTRDIFF(cell, dsk)) > btree->allocsize)
r->raw_offsets[++slots] =
WT_STORE_SIZE(len - WT_BLOCK_COMPRESS_SKIP);
@@ -1677,12 +1677,19 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
* compression function.
*/
memcpy(dst->mem, dsk, WT_BLOCK_COMPRESS_SKIP);
- WT_ERR(compressor->compress_raw(compressor, wt_session,
+ ret = compressor->compress_raw(compressor, wt_session,
r->page_size_max, btree->split_pct,
WT_BLOCK_COMPRESS_SKIP, (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
r->raw_offsets, slots,
(uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
- result_len, final, &result_len, &result_slots));
+ result_len, final, &result_len, &result_slots);
+ if (ret == EAGAIN) {
+ ret = 0;
+ if (!final)
+ goto more_rows;
+ result_slots = 0;
+ }
+ WT_ERR(ret);
dst->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
if (result_slots != 0) {
@@ -1701,11 +1708,14 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session, WT_RECONCILE *r, int final)
* There may be a remnant in the working buffer that didn't get
* compressed; copy it down to the start of the working buffer
* and update the starting record number, free space and so on.
+ * !!!
+ * Note use of memmove, the source and destination buffers can
+ * overlap.
*/
len = WT_PTRDIFF(r->first_free, (uint8_t *)dsk +
r->raw_offsets[result_slots] + WT_BLOCK_COMPRESS_SKIP);
dsk_start = WT_PAGE_HEADER_BYTE(btree, dsk);
- (void)memcpy(dsk_start, (uint8_t *)r->first_free - len, len);
+ (void)memmove(dsk_start, (uint8_t *)r->first_free - len, len);
r->entries -= r->raw_entries[result_slots - 1];
r->first_free = dsk_start + len;
diff --git a/src/docs/compression.dox b/src/docs/compression.dox
index 59f03f4e8ef..92f5c27f25e 100644
--- a/src/docs/compression.dox
+++ b/src/docs/compression.dox
@@ -1,48 +1,36 @@
/*! @page compression Compressors
This section explains how to configure WiredTiger's builtin support for
-the bzip2 and snappy compression engines.
+the zlib, snappy and bzip2 compression engines.
-@section compression_bzip2 Using bzip2 compression
+@section compression_zlib Using zlib compression
-To use the builtin support for
-<a href="http://www.bzip.org/">Julian Seward's bzip2</a>
-compression, first check that bzip2 is installed in include and library
-directories searched by the compiler. Once bzip2 is installed, you can
-enable bzip2 using the \c --enable-bzip2 option to configure.
+To use the builtin support for Greg Roelofs' and Mark Adler's
+<a href="http://www.zlib.net/">zlib</a>
+compression, first check that zlib is installed in include and library
+directories searched by the compiler. Once zlib is installed, you can
+enable zlib using the \c --enable-zlib option to configure.
-If bzip2 is installed in a location not normally searched by the
-compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
-to indicate these locations. For example, with the bzip2 includes and
+If zlib is installed in a location not normally searched by the compiler
+toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS to
+indicate these locations. For example, with the zlib includes and
libraries installed in \c /usr/local/include and \c /usr/local/lib, you
-should run configure as follows:
+would run configure with the following additional arguments:
@code
-cd build_posix
-../configure --enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include"
+--enable-zlib CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
@endcode
-When opening the WiredTiger database, load the bzip2 shared library as
-an extension. For example, with the bzip2 library installed in
+When opening the WiredTiger database, load the zlib shared library as
+an extension. For example, with the WiredTiger library installed in
\c /usr/local/lib, you would use the following extension:
-@snippet ex_all.c Configure bzip2 extension
+@snippet ex_all.c Configure zlib extension
Finally, when creating the WiredTiger object, set \c block_compressor
-to \c bzip2:
-
-@snippet ex_all.c Create a bzip2 compressed table
-
-If necessary, you can confirm the compressor is working by running the
-compression part of the test suite:
+to \c zlib:
-@code
-cd build_posix
-python ../test/suite/run.py compress
-@endcode
-
-Review the test output to verify the bzip2 part of the test passes and
-was not skipped.
+@snippet ex_all.c Create a zlib compressed table
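
For reference, the two snippets referenced above amount to loading the
compressor extension and naming it when creating the table. The sketch
below makes some assumptions beyond the text (a database home in \c home,
the extension installed as \c /usr/local/lib/libwiredtiger_zlib.so, error
handling elided); the snappy and bzip2 sections below follow the same
pattern with their own library and \c block_compressor names:

@code
WT_CONNECTION *conn;
WT_SESSION *session;

ret = wiredtiger_open(home, NULL, "create", &conn);
ret = conn->load_extension(conn,
    "/usr/local/lib/libwiredtiger_zlib.so", NULL);
ret = conn->open_session(conn, NULL, NULL, &session);
ret = session->create(session, "table:mytable",
    "block_compressor=zlib,key_format=S,value_format=S");
@endcode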
@section compression_snappy Using snappy compression
@@ -56,15 +44,14 @@ If snappy is installed in a location not normally searched by the
compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
to indicate these locations. For example, with the snappy includes and
libraries installed in \c /usr/local/include and \c /usr/local/lib, you
-should run configure as follows:
+would run configure with the following additional arguments:
@code
-cd build_posix
-../configure --enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/include"
+--enable-snappy CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
@endcode
When opening the WiredTiger database, load the snappy shared library as
-an extension. For example, with the snappy library installed in
+an extension. For example, with the WiredTiger library installed in
\c /usr/local/lib, you would use the following extension:
@snippet ex_all.c Configure snappy extension
@@ -74,16 +61,34 @@ to \c snappy:
@snippet ex_all.c Create a snappy compressed table
-If necessary, you can confirm the compressor is working by running the
-compression part of the test suite:
+@section compression_bzip2 Using bzip2 compression
+
+To use the builtin support for
+<a href="http://www.bzip.org/">Julian Seward's bzip2</a>
+compression, first check that bzip2 is installed in include and library
+directories searched by the compiler. Once bzip2 is installed, you can
+enable bzip2 using the \c --enable-bzip2 option to configure.
+
+If bzip2 is installed in a location not normally searched by the
+compiler toolchain, you'll need to modify the \c CPPFLAGS and \c LDFLAGS
+to indicate these locations. For example, with the bzip2 includes and
+libraries installed in \c /usr/local/include and \c /usr/local/lib, you
+would run configure with the following additional arguments:
@code
-cd build_posix
-python ../test/suite/run.py compress
+--enable-bzip2 CPPFLAGS="-I/usr/local/include" LDFLAGS="-L/usr/local/lib"
@endcode
-Review the test output to verify the snappy part of the test passes and
-was not skipped.
+When opening the WiredTiger database, load the bzip2 shared library as
+an extension. For example, with the WiredTiger library installed in
+\c /usr/local/lib, you would use the following extension:
+
+@snippet ex_all.c Configure bzip2 extension
+
+Finally, when creating the WiredTiger object, set \c block_compressor
+to \c bzip2:
+
+@snippet ex_all.c Create a bzip2 compressed table
@section compression_upgrading Upgrading compression engines
diff --git a/src/docs/helium.dox b/src/docs/helium.dox
new file mode 100644
index 00000000000..cd6b47fb968
--- /dev/null
+++ b/src/docs/helium.dox
@@ -0,0 +1,125 @@
+/*! @page helium WiredTiger Helium support
+
+WiredTiger supports Levyx Inc.'s Helium Data Store volumes as a data-source.
+
+To configure one or more Helium volumes as WiredTiger data sources, take
+the following steps.
+
+@section helium_build Building the WiredTiger Helium Support
+
+To build the Helium support, use the configuration option \c --with-helium=DIR.
+For example:
+
+@code
+% cd wiredtiger
+% ls /usr/local/lib/Helium
+Helium Programmer's Reference.pdf libhe.a
+README.TXT libhe.so
+he.h
+% ./configure --with-helium=/usr/local/lib/Helium && make
+@endcode
+
+@section helium_load Loading the WiredTiger Helium Support
+
+Next, add code to your application to load the Helium shared library.
+
+The following example loads the Helium shared library, configuring and
+naming two separate Helium volumes. The first volume is named \c dev1,
+the second volume is named \c dev2. Volume \c dev1 has two underlying
+physical Helium devices, \c /dev/disk3s1 and \c /dev/disk4s1. Volume
+\c dev2 has a single underlying physical Helium device, \c /dev/disk5s1.
+
+@code
+#define HELIUM_LIBRARY_PATH "test/helium/.libs/libwiredtiger_helium.so"
+ret = connection->load_extension(connection, HELIUM_LIBRARY_PATH,
+ "config=["
+ "dev1=[helium_devices=[\"he://.//dev/disk3s1,/dev/disk4s1\"],"
+ "helium_o_volume_truncate=1],"
+ "dev2=[helium_devices=[\"he://.//dev/disk5s1\"],"
+ "helium_o_volume_truncate=1]]");
+@endcode
+
+The \c helium_devices configuration string takes a WiredTiger string
+which is a comma-separated list of Helium devices. (Note the quoting
+required for that to be possible.)
+
+In this example, both Helium volumes are configured to be truncated when
+first opened, and all previously existing contents discarded.
+
+When configuring a Helium volume, the following non-standard configuration
+strings are supported:
+
+<table>
+@hrow{String, Type, Meaning}
+@row{helium_devices, list, WiredTiger URI to Helium volume mapping}
+@row{helium_env_read_cache_size, int, struct he_env read_cache_size value}
+@row{helium_env_write_cache_size, int, struct he_env write_cache_size value}
+@row{helium_o_volume_truncate, boolean, HE_O_VOLUME_TRUNCATE flag}
+</table>
+
+With the exception of the configuration string \c helium_devices (which
+is WiredTiger specific), see the Helium documentation for details on
+their use.
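+
As an illustration of how the per-volume configuration nests, a
hypothetical variation of the example above sets explicit read and write
cache sizes (values in bytes) for a single-device volume:

@code
ret = connection->load_extension(connection, HELIUM_LIBRARY_PATH,
    "config=[dev1=[helium_devices=[\"he://.//dev/disk3s1\"],"
    "helium_env_read_cache_size=67108864,"
    "helium_env_write_cache_size=67108864]]");
@endcode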
+
+@section helium_objects Creating WiredTiger objects on Helium volumes
+
+When creating WiredTiger objects on Helium volumes, the volume names are
+used as part of the URI specified to WiredTiger methods such as
+WT_SESSION::create or WT_SESSION::rename, separated from the object name
+by a single slash character.
+
+Additionally, the \c helium \c type configuration string must be included.
+
+The following example creates a table named \c access on the Helium
+volume \c dev1, and then opens a cursor on the table:
+
+@code
+WT_CURSOR *cursor;
+WT_SESSION *session;
+
+/* Create the access table. */
+ret = session->create(
+ session, "table:dev1/access", "key_format=S,value_format=S,type=helium");
+
+/* Open a cursor on the access table. */
+ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor);
+@endcode
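
Once open, the cursor behaves like any other WiredTiger cursor; for
example, inserting a record (a sketch, assuming the string key and value
formats configured above):

@code
cursor->set_key(cursor, "key1");
cursor->set_value(cursor, "value1");
ret = cursor->insert(cursor);
@endcode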
+
+When calling WT_SESSION::create to create an object on a Helium volume,
+the following additional configuration strings are supported:
+
+<table>
+@hrow{String, Type, Meaning}
+@row{helium_o_compress, boolean, HE_I_COMPRESS flag}
+@row{helium_o_truncate, boolean, HE_O_TRUNCATE flag}
+</table>
+
+See the Helium device documentation for details on their use.
+
+For example, creating and truncating a table could be done as follows:
+
+@code
+WT_SESSION *session;
+
+/* Create and truncate the access table. */
+ret = session->create(session, "table:dev1/access",
+    "key_format=S,value_format=S,type=helium,helium_o_truncate=1");
+@endcode
+
+@section helium_notes Helium notes
+
+- Helium volumes do not support hot backup.
+- Helium volumes do not support named checkpoints.
+- Helium volumes do not support compression of any kind.
+- Helium volumes do not support bulk load as a special case, and configuring
+cursors for bulk load has no effect.
+- Inserting a new record after the current maximum record in a fixed-length
+bit field column-store (that is, a store with an 'r' type key and 't' type
+value) does not implicitly create the missing records.
+
+@section helium_limitations Helium limitations
+
+- WiredTiger transactions cannot include operations on both Helium volumes
+and other stores; this will be corrected in a future release.
+
+*/
diff --git a/src/docs/hot_backup.dox b/src/docs/hot_backup.dox
index 0971eca948a..9c0326bcb17 100644
--- a/src/docs/hot_backup.dox
+++ b/src/docs/hot_backup.dox
@@ -10,15 +10,15 @@ To perform a hot backup:
1. Open a cursor on the backup data source, which begins the process of
a hot backup.
-2. Copy each file returned by the WT_CURSOR::next method into a
-different directory.
+2. Copy each file returned by the WT_CURSOR::next method to the hot
+backup location, for example, a different directory.
3. Close the cursor; the cursor must not be closed until all of the
files have been copied.
-The directory to which the files are copied may subsequently be
-specified as an directory to the ::wiredtiger_open function and accessed
-as a WiredTiger database home.
+A directory to which the files are copied may subsequently be specified
+as the directory argument to the ::wiredtiger_open function and accessed
+as a
+WiredTiger database home.
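
A sketch of the copy loop in step 2 (error handling and the actual file
copy are elided; \c session is an open WT_SESSION):

@code
WT_CURSOR *cursor;
const char *filename;
int ret;

ret = session->open_cursor(session, "backup:", NULL, NULL, &cursor);
while ((ret = cursor->next(cursor)) == 0) {
	ret = cursor->get_key(cursor, &filename);
	/* Copy "filename" from the database home to the backup directory. */
}
/* Close the cursor only after every file has been copied. */
ret = cursor->close(cursor);
@endcode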
Notes:
diff --git a/src/docs/memrata.dox b/src/docs/memrata.dox
deleted file mode 100644
index c915f0c59ea..00000000000
--- a/src/docs/memrata.dox
+++ /dev/null
@@ -1,129 +0,0 @@
-/*! @page memrata WiredTiger Memrata support
-
-WiredTiger supports Memrata KVS devices as a data-source.
-
-To configure one or more Memrata KVS devices as WiredTiger data sources,
-take the following steps.
-
-@section memrata_build Building the WiredTiger Memrata Support
-
-To build the Memrata support, add a link in the WiredTiger build
-directory to the installed location of the Memrata software. For
-example:
-
-@code
-% cd wiredtiger
-% ls /usr/local/memrata
-kvs.h libkvs.a libkvs.so
-kvs.h.4.2 libkvs.a.4.2 libkvs.so.4.2
-% ln -s /usr/local/memrata memrata
-% ./configure && make
-@endcode
-
-@section memrata_load Loading the WiredTiger Memrata Support
-
-Second, change your application to load the Memrata shared library. The
-following example loads the Memrata shared library, configuring and
-naming two separate Memrata device pools. The first device pool is
-named \c dev1, the second device pool is named \c dev2. Device pool \c
-dev1 has two underlying Memrata devices, \c /dev/ssd0 and \c /dev/ssd1.
-Device pool \c dev2 has a single underlying Memrata device, \c
-/dev/ssd2.
-
-@code
-#define MEMRATA_LIBRARY_PATH "test/memrata/.libs/libwiredtiger_memrata.so""
-ret = connection->load_extension(connection, MEMRATA_LIBRARY_PATH,
- "config=["
- "dev1=[kvs_devices=[/dev/ssd0,/dev/ssd1],kvs_open_o_truncate=1],"
- "dev2=[kvs_devices=[/dev/ssd2],kvs_open_o_truncate=1]]");
-@endcode
-
-The \c kvs_devices configuration string takes a WiredTiger configuration
-list, that is, a comma-separated list of Memrata devices.
-
-In this example, both device pools are configured to be truncated (that
-is, all previously existing contents discarded), when they are configured.
-
-When loading a Memrata device, the following additional configuration strings
-are supported:
-
-<table>
-@hrow{String, Type}
-@row{kvs_devices, list of lists}
-@row{kvs_parallelism, int}
-@row{kvs_granularity, int}
-@row{kvs_avg_key_len, int}
-@row{kvs_avg_val_len, int}
-@row{kvs_write_bufs, int}
-@row{kvs_read_bufs, int}
-@row{kvs_commit_timeout, int}
-@row{kvs_reclaim_threshold, int}
-@row{kvs_reclaim_period, int}
-@row{kvs_open_o_debug, boolean}
-@row{kvs_open_o_truncate, boolean}
-</table>
-
-With the exception of the configuration string \c kvs_devices (which is
-WiredTiger specific), see the Memrata device documentation for details
-on their use.
-
-@section memrata_objects Creating Memrata-backed objects
-
-The device pool names are used as part of the URI specified to WiredTiger
-methods such as WT_SESSION::create or WT_SESSION::rename, separated from
-the object name by a single slash character.
-
-Additionally, the \c memrata \c type configuration string must be included.
-
-The following example creates a Memrata table named \c access in the
-device pool \c dev1, and then opens a cursor on the table:
-
-@code
-WT_CURSOR *cursor;
-WT_SESSION *session;
-
-/* Create the access table. */
-ret = session->create(
- session, "table:dev1/access", "key_format=S,value_format=S,type=memrata");
-
-/* Open a cursor on the access table. */
-ret = session->open_cursor(session, "table:dev1/access", NULL, NULL, &cursor);
-@endcode
-
-When creating a Memrata-backed object with the WT_SESSION::create method,
-the following additional configuration strings are supported:
-
-<table>
-@hrow{String, Type}
-@row{kvs_open_o_debug, boolean}
-@row{kvs_open_o_truncate, boolean}
-</table>
-
-See the Memrata device documentation for details on their use.
-
-For example, creating and truncating a table could be done as follows:
-
-@code
-WT_SESSION *session;
-
-/* Create and truncate the access table. */
-ret = session->create(session, "table:dev1/access",
- "key_format=S,value_format=S,type=memrata,kvs_open_o_truncate=1");
-@endcode
-
-@section memrata_notes Memrata notes
-
-- Memrata devices do not support named checkpoints.
-- Inserting a new record after the current maximum record in a fixed-length
-bit field column-store (that is, a store with an 'r' type key and 't' type
-value) does not implicitly create the missing records.
-- Memrata devices do not support bulk load as a special case, and configuring
-cursors for bulk load has no effect.
-- Memrata devices do not support compression of any kind.
-
-@section memrata_limitations Memrata limitations
-
-- WiredTiger transactions cannot include operations on both Memrata devices
-and other stores.
-
-*/
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index 5bf5d965afc..54e641fa3a4 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -33,7 +33,7 @@ WiredTiger applications:
@section programming_extending Extending WiredTiger
- @subpage custom_data_sources
-- @subpage memrata
+- @subpage helium
@section programming_admin Administering a WiredTiger database
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index 1012eef1f93..6d24c474e19 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -1,5 +1,6 @@
personal_ws-1.1 en 200
APIs
+Adler's
Atomicity
BLOBs
CFLAGS
@@ -12,6 +13,7 @@ DbCursor
DbEnv
DbMultiple
EB
+EAGAIN
EBUSY
EINVAL
EmpId
@@ -28,18 +30,21 @@ LIBS
LSB
LSM
Lameter
+Levyx
MERCHANTABILITY
MVCC's
Makefiles
-Memrata
Mewhort
NOTFOUND
NUMA
NoSQL
+README
RepMgr
+Roelofs
Rrx
Seward's
SiS
+TXT
URIs
Vv
WiredTiger
@@ -87,6 +92,7 @@ command's
comparator
cond
config
+configurign
conn
const
control's
@@ -146,6 +152,7 @@ firstname
fnv
fput
freelist
+fsync
gcc
gdbm
getopt
@@ -181,6 +188,7 @@ lastname
len
li
libdir
+libhe
libkvs
libtool
libwiredtiger
@@ -207,7 +215,6 @@ maxleafpage
memalloc
memfree
memp
-memrata
metadata
minkey
mkdir
@@ -356,3 +363,4 @@ writelocks
wrlock
xa
yieldcpu
+zlib
diff --git a/src/docs/top/Doxyfile b/src/docs/top/Doxyfile
index 59a3667b169..ed4f2eb8c3b 100644
--- a/src/docs/top/Doxyfile
+++ b/src/docs/top/Doxyfile
@@ -2,7 +2,7 @@
PROJECT_NUMBER = "Developer Site"
OUTPUT_DIRECTORY = ../../docs/top
-INPUT = top license.dox
+INPUT = top community.dox license.dox
EXCLUDE =
GENERATE_TREEVIEW = NO
diff --git a/src/docs/top/main.dox b/src/docs/top/main.dox
index 821f22102d3..5481d2deae5 100644
--- a/src/docs/top/main.dox
+++ b/src/docs/top/main.dox
@@ -6,9 +6,9 @@ WiredTiger is an high performance, scalable, production quality, NoSQL,
@section releases Releases
<table>
-@row{<b>WiredTiger 2.0.1</b> (current),
- <a href="releases/wiredtiger-2.0.1.tar.bz2"><b>[Release package]</b></a>,
- <a href="2.0.1/index.html"><b>[Documentation]</b></a>}
+@row{<b>WiredTiger 2.1.0</b> (current),
+ <a href="releases/wiredtiger-2.1.0.tar.bz2"><b>[Release package]</b></a>,
+ <a href="2.1.0/index.html"><b>[Documentation]</b></a>}
@row{<b>WiredTiger 1.6.6</b> (previous),
<a href="releases/wiredtiger-1.6.6.tar.bz2"><b>[Release package]</b></a>,
<a href="1.6.6/index.html"><b>[Documentation]</b></a>}
diff --git a/src/docs/upgrading.dox b/src/docs/upgrading.dox
index 9c250824fee..e59b031a1ff 100644
--- a/src/docs/upgrading.dox
+++ b/src/docs/upgrading.dox
@@ -10,6 +10,26 @@ In the 2.1 release of WiredTiger WT_ITEM::size type has changed from
resolve compile-time errors.
</dd>
+<dt>WT_COMPRESSOR::compress_raw behavior</dt>
+<dd>
+In the 2.1 release of WiredTiger, the behavior of the compress_raw
+callback has changed so that it will only be retried if it returns
+\c EAGAIN. If it returns zero and sets \c result_slots to zero,
+WiredTiger will assume that raw compression has failed and will fall
+back to calling WT_COMPRESSOR::compress.
+</dd>
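
The contract can be summarized with a skeleton callback. This is an
illustrative sketch only: the parameter list mirrors the call in
src/btree/rec_write.c in this change, the parameter names and exact
types are assumptions, and a real callback would encode \c src into
\c dst and set \c *result_lenp:

@code
static int
my_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
    size_t page_max, int split_pct, size_t extra,
    uint8_t *src, uint32_t *offsets, uint32_t slots,
    uint8_t *dst, size_t dst_len, int final,
    size_t *result_lenp, uint32_t *result_slotsp)
{
	/*
	 * Not enough byte strings to build a block yet and more rows are
	 * on the way: returning EAGAIN asks WiredTiger to accumulate more
	 * rows and call again.
	 */
	if (slots < 2 && !final)
		return (EAGAIN);

	/*
	 * Give up on raw compression for this block: a zero return with
	 * zero result slots makes WiredTiger fall back to
	 * WT_COMPRESSOR::compress.
	 */
	*result_slotsp = 0;
	return (0);
}
@endcode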
+
+<dt>Transaction sync default setting</dt>
+<dd>
+In the 2.1 release of WiredTiger, the default value of the
+::wiredtiger_open \c transaction_sync configuration setting has changed
+from "dsync" to "fsync". This is due to enhancements to the group
+commit implementation in WiredTiger, which mean that greater throughput
+can be achieved with explicit "fsync" calls than by enabling "dsync" on
+a file handle.
+Applications that don't execute concurrent transactions may see better
+throughput with transaction_sync set to "dsync".
+</dd>
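
For example, an application that does not run concurrent transactions
and wants the previous behavior back could request it explicitly when
opening the database (a sketch; \c home is the database home):

@code
WT_CONNECTION *conn;

ret = wiredtiger_open(home, NULL, "create,transaction_sync=dsync", &conn);
@endcode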
+
@section version_20 Upgrading to Version 2.0
<dl>
diff --git a/src/include/btmem.h b/src/include/btmem.h
index e4b30f03ab9..7f0bf280d5c 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -521,14 +521,9 @@ struct __wt_ref {
* WT_MERGE_FULL_PAGE --
* When the result of a merge contains more than this number of keys, it is
* considered "done" and will not be merged again.
- *
- * WT_MERGE_MAX_REFS --
- * Don't complete merges that contain more than this number of keys, they tend
- * to generate pathological trees.
*/
#define WT_MERGE_STACK_MIN 3
#define WT_MERGE_FULL_PAGE 100
-#define WT_MERGE_MAX_REFS 1000
/*
* WT_ROW --
diff --git a/src/include/btree.i b/src/include/btree.i
index fc9a73f4d9d..f09d05178ab 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -528,6 +528,41 @@ __wt_ref_info(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/*
+ * __wt_eviction_force_check --
+ * Check if a page matches the criteria for forced eviction.
+ */
+static inline int
+__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_BTREE *btree;
+
+ btree = S2BT(session);
+
+ /* Pages are usually small enough, check that first. */
+ if (page->memory_footprint < btree->maxmempage)
+ return (0);
+
+ /* Leaf pages only. */
+ if (page->type != WT_PAGE_COL_FIX &&
+ page->type != WT_PAGE_COL_VAR &&
+ page->type != WT_PAGE_ROW_LEAF)
+ return (0);
+
+ /* Eviction may be turned off, although that's rare. */
+ if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
+ return (0);
+
+ /*
+ * It's hard to imagine a page with a huge memory footprint that has
+ * never been modified, but check to be sure.
+ */
+ if (page->modify == NULL)
+ return (0);
+
+ return (1);
+}
+
+/*
* __wt_page_release --
* Release a reference to a page.
*/
@@ -557,7 +592,7 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_PAGE *page)
return (ret);
}
- ret = __wt_evict_page(session, page);
+ WT_TRET(__wt_evict_page(session, page));
if (ret == 0)
WT_STAT_FAST_CONN_INCR(session, cache_eviction_force);
else
@@ -642,43 +677,8 @@ __wt_page_hazard_check(WT_SESSION_IMPL *session, WT_PAGE *page)
}
/*
- * __wt_eviction_force_check --
- * Check if a page matches the criteria for forced eviction.
- */
-static inline int
-__wt_eviction_force_check(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_BTREE *btree;
-
- btree = S2BT(session);
-
- /* Pages are usually small enough, check that first. */
- if (page->memory_footprint < btree->maxmempage)
- return (0);
-
- /* Leaf pages only. */
- if (page->type != WT_PAGE_COL_FIX &&
- page->type != WT_PAGE_COL_VAR &&
- page->type != WT_PAGE_ROW_LEAF)
- return (0);
-
- /* Eviction may be turned off, although that's rare. */
- if (F_ISSET(btree, WT_BTREE_NO_EVICTION))
- return (0);
-
- /*
- * It's hard to imagine a page with a huge memory footprint that has
- * never been modified, but check to be sure.
- */
- if (page->modify == NULL)
- return (0);
-
- return (1);
-}
-
-/*
* __wt_eviction_force --
- * Check if the current transaction permits forced eviction of a page.
+ * Check if the current transaction permits forced eviction of a page.
*/
static inline int
__wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -702,7 +702,7 @@ __wt_eviction_force_txn_check(WT_SESSION_IMPL *session, WT_PAGE *page)
/*
* __wt_eviction_force --
- * Forcefully evict a page, if possible.
+ * Forcefully evict a page, if possible.
*/
static inline int
__wt_eviction_force(WT_SESSION_IMPL *session, WT_PAGE *page)
@@ -852,7 +852,7 @@ __wt_lex_compare_skip(
/*
* __wt_btree_mergeable --
- * Determines whether the given page is a candidate for merging.
+ * Determines whether the given page is a candidate for merging.
*/
static inline int
__wt_btree_mergeable(WT_PAGE *page)
diff --git a/src/include/stat.h b/src/include/stat.h
index 6717b4d081f..ea2a4068f96 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -182,6 +182,8 @@ struct __wt_connection_stats {
WT_STATS log_slot_transitions;
WT_STATS log_sync;
WT_STATS log_writes;
+ WT_STATS lsm_checkpoint_throttle;
+ WT_STATS lsm_merge_throttle;
WT_STATS lsm_rows_merged;
WT_STATS memory_allocation;
WT_STATS memory_free;
@@ -275,9 +277,11 @@ struct __wt_dsrc_stats {
WT_STATS cursor_search_near;
WT_STATS cursor_update;
WT_STATS cursor_update_bytes;
+ WT_STATS lsm_checkpoint_throttle;
WT_STATS lsm_chunk_count;
WT_STATS lsm_generation_max;
WT_STATS lsm_lookup_no_bloom;
+ WT_STATS lsm_merge_throttle;
WT_STATS rec_dictionary;
WT_STATS rec_overflow_key_internal;
WT_STATS rec_overflow_key_leaf;
diff --git a/src/include/txn.i b/src/include/txn.i
index fc0a4d2317f..cdfe697ee51 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -18,6 +18,8 @@ __txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
WT_TXN *txn;
txn = &session->txn;
+ *opp = NULL;
+
WT_ASSERT(session, F_ISSET(txn, TXN_RUNNING));
WT_RET(__wt_realloc_def(session, &txn->mod_alloc,
txn->mod_count + 1, &txn->mod));
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index be4474ed14f..b5634c9d205 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -855,7 +855,9 @@ struct __wt_session {
* value can be created. Must be larger than chunk_size., an integer
* between 100MB and 10TB; default \c 5GB.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;chunk_size, the maximum size of the
- * in-memory chunk of an LSM tree., an integer between 512K and 500MB;
+ * in-memory chunk of an LSM tree. This limit is soft - it is possible
+ * for chunks to be temporarily larger than this value. This overrides
+ * the \c memory_page_max setting., an integer between 512K and 500MB;
* default \c 10MB.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;merge_max, the
* maximum number of chunks to include in a merge operation., an integer
@@ -872,7 +874,8 @@ struct __wt_session {
* memory before being reconciled to disk. The specified size will be
* adjusted to a lower bound of <code>50 * leaf_page_max</code>. This
* limit is soft - it is possible for pages to be temporarily larger
- * than this value., an integer between 512B and 10TB; default \c 5MB.}
+ * than this value. This setting is ignored for LSM trees\, see \c
+ * chunk_size., an integer between 512B and 10TB; default \c 5MB.}
* @config{os_cache_dirty_max, maximum dirty system buffer cache usage\,
* in bytes. If non-zero\, schedule writes for dirty blocks belonging
* to this object in the system buffer cache after that many bytes from
@@ -2133,14 +2136,19 @@ struct __wt_compressor {
* set \c result_slotsp to the number of byte strings encoded and
* \c result_lenp to the bytes needed for the encoded representation.
*
- * WiredTiger repeatedly calls the callback function until all rows on
- * the page have been encoded. There is no requirement the callback
- * encode any or all of the byte strings passed by WiredTiger. If the
- * callback does not encode any of the byte strings, the callback must
- * set \c result_slotsp to 0. In this case, WiredTiger will accumulate
- * more rows and repeat the call; if there are no more rows to
- * accumulate, WiredTiger writes the remaining rows without further
- * calls to the callback.
+ * There is no requirement the callback encode any or all of the byte
+ * strings passed by WiredTiger. If the callback does not encode any
+ * of the byte strings and compression should not be retried, the
+ * callback should set \c result_slotsp to 0.
+ *
+ * If the callback does not encode any of the byte strings and
+ * compression should be retried with additional byte strings, the
+ * callback must return \c EAGAIN. In that case, WiredTiger will
+ * accumulate more rows and repeat the call.
+ *
+ * If there are no more rows to accumulate or the callback indicates
+ * that it cannot be retried, WiredTiger writes the remaining rows
+ * using \c WT_COMPRESSOR::compress.
*
* On entry, \c final is zero if there are more rows to be written as
* part of this page (if there will be additional data provided to the
@@ -2580,42 +2588,46 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_LOG_SYNC 1063
/*! log: log write operations */
#define WT_STAT_CONN_LOG_WRITES 1064
+/*! sleep for LSM checkpoint throttle */
+#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1065
+/*! sleep for LSM merge throttle */
+#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1066
/*! rows merged in an LSM tree */
-#define WT_STAT_CONN_LSM_ROWS_MERGED 1065
+#define WT_STAT_CONN_LSM_ROWS_MERGED 1067
/*! memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1066
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1068
/*! memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1067
+#define WT_STAT_CONN_MEMORY_FREE 1069
/*! memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1068
+#define WT_STAT_CONN_MEMORY_GROW 1070
/*! total read I/Os */
-#define WT_STAT_CONN_READ_IO 1069
+#define WT_STAT_CONN_READ_IO 1071
/*! page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1070
+#define WT_STAT_CONN_REC_PAGES 1072
/*! page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1071
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1073
/*! reconciliation failed because an update could not be included */
-#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1072
+#define WT_STAT_CONN_REC_SKIPPED_UPDATE 1074
/*! pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1073
+#define WT_STAT_CONN_RWLOCK_READ 1075
/*! pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1074
+#define WT_STAT_CONN_RWLOCK_WRITE 1076
/*! open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1075
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1077
/*! transactions */
-#define WT_STAT_CONN_TXN_BEGIN 1076
+#define WT_STAT_CONN_TXN_BEGIN 1078
/*! transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1077
+#define WT_STAT_CONN_TXN_CHECKPOINT 1079
/*! transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1078
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1080
/*! transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1079
+#define WT_STAT_CONN_TXN_COMMIT 1081
/*! transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1080
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1082
/*! transactions rolled-back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1081
+#define WT_STAT_CONN_TXN_ROLLBACK 1083
/*! total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1082
+#define WT_STAT_CONN_WRITE_IO 1084
/*!
* @}
@@ -2759,43 +2771,47 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CURSOR_UPDATE 2066
/*! cursor-update value bytes updated */
#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2067
+/*! sleep for LSM checkpoint throttle */
+#define WT_STAT_DSRC_LSM_CHECKPOINT_THROTTLE 2068
/*! chunks in the LSM tree */
-#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2068
+#define WT_STAT_DSRC_LSM_CHUNK_COUNT 2069
/*! highest merge generation in the LSM tree */
-#define WT_STAT_DSRC_LSM_GENERATION_MAX 2069
+#define WT_STAT_DSRC_LSM_GENERATION_MAX 2070
/*! queries that could have benefited from a Bloom filter that did not
* exist */
-#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2070
+#define WT_STAT_DSRC_LSM_LOOKUP_NO_BLOOM 2071
+/*! sleep for LSM merge throttle */
+#define WT_STAT_DSRC_LSM_MERGE_THROTTLE 2072
/*! reconciliation dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2071
+#define WT_STAT_DSRC_REC_DICTIONARY 2073
/*! reconciliation internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2072
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2074
/*! reconciliation leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2073
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2075
/*! reconciliation overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2074
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2076
/*! reconciliation pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2075
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2077
/*! reconciliation pages merged */
-#define WT_STAT_DSRC_REC_PAGE_MERGE 2076
+#define WT_STAT_DSRC_REC_PAGE_MERGE 2078
/*! page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2077
+#define WT_STAT_DSRC_REC_PAGES 2079
/*! page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2078
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2080
/*! reconciliation failed because an update could not be included */
-#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2079
+#define WT_STAT_DSRC_REC_SKIPPED_UPDATE 2081
/*! reconciliation internal pages split */
-#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2080
+#define WT_STAT_DSRC_REC_SPLIT_INTERNAL 2082
/*! reconciliation leaf pages split */
-#define WT_STAT_DSRC_REC_SPLIT_LEAF 2081
+#define WT_STAT_DSRC_REC_SPLIT_LEAF 2083
/*! reconciliation maximum splits for a page */
-#define WT_STAT_DSRC_REC_SPLIT_MAX 2082
+#define WT_STAT_DSRC_REC_SPLIT_MAX 2084
/*! object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2083
+#define WT_STAT_DSRC_SESSION_COMPACT 2085
/*! open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2084
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2086
/*! update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2085
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2087
/*! @} */
/*
* Statistics section: END
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index 618257469ee..c50380b91b9 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -1115,9 +1115,18 @@ __clsm_put(WT_SESSION_IMPL *session,
* don't worry about protecting access.
*/
if (++clsm->primary_chunk->count % 100 == 0 &&
- lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0)
+ lsm_tree->merge_throttle + lsm_tree->ckpt_throttle > 0) {
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_checkpoint_throttle, (uint64_t)lsm_tree->ckpt_throttle);
+ WT_STAT_FAST_INCRV(session, &clsm->lsm_tree->stats,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
+ WT_STAT_FAST_CONN_INCRV(session,
+ lsm_merge_throttle, (uint64_t)lsm_tree->merge_throttle);
__wt_sleep(0,
lsm_tree->ckpt_throttle + lsm_tree->merge_throttle);
+ }
/*
* In LSM there are multiple btrees active at one time. The tree
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index 3aec49da252..a830295908f 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -407,6 +407,10 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
* Set up the config for each chunk. If possible, avoid high latencies
* from fsync by flushing the cache every 8MB (will be overridden by
* any application setting).
+ *
+ * Also make the memory_page_max double the chunk size, so application
+ * threads don't immediately try to force evict the chunk when the
+ * worker thread clears the NO_EVICTION flag.
*/
tmpconfig = "";
#ifdef HAVE_SYNC_FILE_RANGE
@@ -415,7 +419,8 @@ __wt_lsm_tree_create(WT_SESSION_IMPL *session,
#endif
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf,
- "%s%s,key_format=u,value_format=u", tmpconfig, config));
+ "%s%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
+ tmpconfig, config, 2 * lsm_tree->chunk_max));
lsm_tree->file_config = __wt_buf_steal(session, buf);
/* Create the first chunk and flush the metadata. */
diff --git a/src/support/stat.c b/src/support/stat.c
index 621c79220a4..c0caecbe606 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -93,11 +93,14 @@ __wt_stat_init_dsrc_stats(WT_DSRC_STATS *stats)
stats->cursor_search_near.desc = "cursor search near calls";
stats->cursor_update.desc = "cursor update calls";
stats->cursor_update_bytes.desc = "cursor-update value bytes updated";
+ stats->lsm_checkpoint_throttle.desc =
+ "sleep for LSM checkpoint throttle";
stats->lsm_chunk_count.desc = "chunks in the LSM tree";
stats->lsm_generation_max.desc =
"highest merge generation in the LSM tree";
stats->lsm_lookup_no_bloom.desc =
"queries that could have benefited from a Bloom filter that did not exist";
+ stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle";
stats->rec_dictionary.desc = "reconciliation dictionary matches";
stats->rec_overflow_key_internal.desc =
"reconciliation internal-page overflow keys";
@@ -194,9 +197,11 @@ __wt_stat_refresh_dsrc_stats(void *stats_arg)
stats->cursor_search_near.v = 0;
stats->cursor_update.v = 0;
stats->cursor_update_bytes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
stats->lsm_chunk_count.v = 0;
stats->lsm_generation_max.v = 0;
stats->lsm_lookup_no_bloom.v = 0;
+ stats->lsm_merge_throttle.v = 0;
stats->rec_dictionary.v = 0;
stats->rec_overflow_key_internal.v = 0;
stats->rec_overflow_key_leaf.v = 0;
@@ -280,9 +285,11 @@ __wt_stat_aggregate_dsrc_stats(const void *child, const void *parent)
p->cursor_search_near.v += c->cursor_search_near.v;
p->cursor_update.v += c->cursor_update.v;
p->cursor_update_bytes.v += c->cursor_update_bytes.v;
+ p->lsm_checkpoint_throttle.v += c->lsm_checkpoint_throttle.v;
if (c->lsm_generation_max.v > p->lsm_generation_max.v)
p->lsm_generation_max.v = c->lsm_generation_max.v;
p->lsm_lookup_no_bloom.v += c->lsm_lookup_no_bloom.v;
+ p->lsm_merge_throttle.v += c->lsm_merge_throttle.v;
p->rec_dictionary.v += c->rec_dictionary.v;
p->rec_overflow_key_internal.v += c->rec_overflow_key_internal.v;
p->rec_overflow_key_leaf.v += c->rec_overflow_key_leaf.v;
@@ -389,6 +396,9 @@ __wt_stat_init_connection_stats(WT_CONNECTION_STATS *stats)
"log: consolidated slot join transitions";
stats->log_sync.desc = "log: log sync operations";
stats->log_writes.desc = "log: log write operations";
+ stats->lsm_checkpoint_throttle.desc =
+ "sleep for LSM checkpoint throttle";
+ stats->lsm_merge_throttle.desc = "sleep for LSM merge throttle";
stats->lsm_rows_merged.desc = "rows merged in an LSM tree";
stats->memory_allocation.desc = "memory allocations";
stats->memory_free.desc = "memory frees";
@@ -479,6 +489,8 @@ __wt_stat_refresh_connection_stats(void *stats_arg)
stats->log_slot_transitions.v = 0;
stats->log_sync.v = 0;
stats->log_writes.v = 0;
+ stats->lsm_checkpoint_throttle.v = 0;
+ stats->lsm_merge_throttle.v = 0;
stats->lsm_rows_merged.v = 0;
stats->memory_allocation.v = 0;
stats->memory_free.v = 0;
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 8cafc78c11f..f4cd3a94a15 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -24,7 +24,7 @@ __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri)
/*
* This function exists as a place for this comment: named checkpoints
- * are only supported on file objects, and not on LSM trees or Memrata
+ * are only supported on file objects, and not on LSM trees or Helium
* devices. If a target list is configured for the checkpoint, this
* function is called with each target list entry; check the entry to
* make sure it's backed by a file. If no target list is configured,
@@ -148,11 +148,11 @@ __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
WT_DATA_SOURCE *dsrc;
/*
- * A place-holder, to support Memrata devices: we assume calling the
+ * A place-holder, to support Helium devices: we assume calling the
* underlying data-source session checkpoint function is sufficient to
* checkpoint all objects in the data source, open or closed, and we
* don't attempt to optimize the checkpoint of individual targets.
- * Those assumptions is correct for the Memrata device, but it's not
+	 * Those assumptions are correct for the Helium device, but they're not
* necessarily going to be true for other data sources.
*
* It's not difficult to support data-source checkpoints of individual
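The comment above reduces to a URI-prefix check: a named-checkpoint target is accepted only when it is backed by a file, and LSM, Helium or other data-source targets are rejected. A hypothetical sketch of that kind of check (the function name and error value are illustrative, not the body of __checkpoint_name_check):

#include <errno.h>
#include <string.h>

/* Illustrative only: allow named-checkpoint targets backed by a file,
 * reject LSM trees, Helium devices and other data sources. */
static int
named_checkpoint_target_ok(const char *uri)
{
	return (strncmp(uri, "file:", strlen("file:")) == 0 ? 0 : ENOTSUP);
}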
diff --git a/test/format/Makefile.am b/test/format/Makefile.am
index a7ab8cb2698..c0c93528945 100644
--- a/test/format/Makefile.am
+++ b/test/format/Makefile.am
@@ -1,5 +1,4 @@
-AM_CPPFLAGS = -DWIREDTIGER_TEST_COMPRESS_RAW \
- -DBERKELEY_DB_PATH=\"$(BERKELEY_DB_PATH)\" \
+AM_CPPFLAGS = -DBERKELEY_DB_PATH=\"$(BERKELEY_DB_PATH)\" \
-I$(top_builddir) -I$(top_srcdir)/src/include \
-I$(BERKELEY_DB_PATH)/include
@@ -12,8 +11,7 @@ t_SOURCES =\
t_LDADD = $(top_builddir)/libwiredtiger.la -L$(BERKELEY_DB_PATH)/lib -ldb
t_LDFLAGS = -static
-noinst_LTLIBRARIES = raw_compress.la
-#noinst_LTLIBRARIES = lzo_compress.la raw_compress.la
+#noinst_LTLIBRARIES = lzo_compress.la
# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
# as installation, it will only build static libraries. As far as I can tell,
@@ -22,25 +20,14 @@ noinst_LTLIBRARIES = raw_compress.la
#lzo_compress_la_LIBADD = -llzo2 -lm
#lzo_compress_la_LDFLAGS = -avoid-version -module -rpath /nowhere
-# libtool hack: noinst_LTLIBRARIES turns off building shared libraries as well
-# as installation, it will only build static libraries. As far as I can tell,
-# the "approved" libtool way to turn them back on is by adding -rpath.
-raw_compress_la_SOURCES = bzip2_compress.c
-raw_compress_la_LIBADD = -lbz2
-raw_compress_la_LDFLAGS = -avoid-version -module -rpath /nowhere
-
s_dumpcmp: $(srcdir)/s_dumpcmp.sh
cp $(srcdir)/s_dumpcmp.sh $@
chmod +x $@
-# We build bzip_compress.c with special behaviors, retrieve our own local copy.
-bzip2_compress.c: $(srcdir)/../../ext/compressors/bzip2/bzip2_compress.c
- cp $? .
-
backup:
rm -rf BACKUP && cp -p -r RUNDIR BACKUP
TESTS = smoke.sh
clean-local:
- rm -rf RUNDIR *.core bzip2_compress.c
+ rm -rf RUNDIR *.core
diff --git a/test/format/backup.c b/test/format/backup.c
index affa2f2e2a2..0711b6d487b 100644
--- a/test/format/backup.c
+++ b/test/format/backup.c
@@ -40,21 +40,14 @@ check_copy(void)
wts_open(g.home_backup, 0, &conn);
- /*
- * Open a session and verify the store; some data-sources don't support
- * verify.
- */
- if (!DATASOURCE("memrata")) {
- if ((ret = conn->open_session(
- conn, NULL, NULL, &session)) != 0)
- die(ret, "connection.open_session: %s", g.home_backup);
-
- /* Session operations for LSM can return EBUSY. */
- ret = session->verify(session, g.uri, NULL);
- if (ret != 0 && !(ret == EBUSY && DATASOURCE("lsm")))
- die(ret,
- "session.verify: %s: %s", g.home_backup, g.uri);
- }
+ if ((ret = conn->open_session(
+ conn, NULL, NULL, &session)) != 0)
+ die(ret, "connection.open_session: %s", g.home_backup);
+
+ /* Session operations for LSM can return EBUSY. */
+ ret = session->verify(session, g.uri, NULL);
+ if (ret != 0 && !(ret == EBUSY && DATASOURCE("lsm")))
+ die(ret, "session.verify: %s: %s", g.home_backup, g.uri);
if ((ret = conn->close(conn, NULL)) != 0)
die(ret, "connection.close: %s", g.home_backup);
@@ -104,7 +97,7 @@ hot_backup(void *arg)
return (NULL);
/* Hot backups aren't supported for non-standard data sources. */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
return (NULL);
/* Open a session. */
diff --git a/test/format/bdb.c b/test/format/bdb.c
index e6f72fff103..1239c57d75e 100644
--- a/test/format/bdb.c
+++ b/test/format/bdb.c
@@ -38,8 +38,8 @@ bdb_compare_reverse(DB *dbp, const DBT *k1, const DBT *k2
#endif
)
{
- int cmp;
size_t len;
+ int cmp;
WT_UNUSED(dbp);
#if DB_VERSION_MAJOR >= 6
diff --git a/test/format/bulk.c b/test/format/bulk.c
index 3600499b8d7..3b65ad29685 100644
--- a/test/format/bulk.c
+++ b/test/format/bulk.c
@@ -52,7 +52,7 @@ wts_load(void)
* the order of insertion will not match the collation order.
*/
is_bulk = !g.c_reverse &&
- !DATASOURCE("kvsbdb") && !DATASOURCE("memrata");
+ !DATASOURCE("kvsbdb") && !DATASOURCE("helium");
if ((ret = session->open_cursor(session, g.uri, NULL,
is_bulk ? "bulk" : NULL, &cursor)) != 0)
die(ret, "session.open_cursor");
diff --git a/test/format/compact.c b/test/format/compact.c
index 7d2d6c1338d..12af0c7b56c 100644
--- a/test/format/compact.c
+++ b/test/format/compact.c
@@ -42,7 +42,7 @@ compact(void *arg)
WT_UNUSED(arg);
/* Compaction isn't supported for all data sources. */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
return (NULL);
/* Open a session. */
diff --git a/test/format/config.c b/test/format/config.c
index 6448acdfd83..7efc75ba078 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -36,6 +36,7 @@ static int config_find_is_perm(const char *, size_t);
static void config_map_checksum(const char *, u_int *);
static void config_map_compression(const char *, u_int *);
static void config_map_file_type(const char *, u_int *);
+static void config_sanity(void);
/*
* config_setup --
@@ -100,7 +101,7 @@ config_setup(void)
if ((g.uri = malloc(256)) == NULL)
syserr("malloc");
strcpy(g.uri, DATASOURCE("file") ? "file:" : "table:");
- if (DATASOURCE("memrata"))
+ if (DATASOURCE("helium"))
strcat(g.uri, "dev1/");
strcat(g.uri, WT_NAME);
@@ -125,14 +126,14 @@ config_setup(void)
*cp->v = CONF_RAND(cp);
}
- /* KVS requires shared libraries. */
+ /* Required shared libraries. */
+ if (DATASOURCE("helium") && access(HELIUM_PATH, R_OK) != 0)
+ die(errno, "Levyx/helium shared library: %s", HELIUM_PATH);
if (DATASOURCE("kvsbdb") && access(KVS_BDB_PATH, R_OK) != 0)
die(errno, "kvsbdb shared library: %s", KVS_BDB_PATH);
- if (DATASOURCE("memrata") && access(MEMRATA_PATH, R_OK) != 0)
- die(errno, "memrata shared library: %s", MEMRATA_PATH);
- /* KVS doesn't support user-specified collations. */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
+ /* Some data-sources don't support user-specified collations. */
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
g.c_reverse = 0;
config_checksum();
@@ -172,6 +173,9 @@ config_setup(void)
/* Reset the key count. */
g.key_cnt = 0;
+
+ /* Perform any final sanity checks. */
+ config_sanity();
}
/*
@@ -219,27 +223,31 @@ config_compression(void)
if (!(cp->flags & C_PERM)) {
cstr = "compression=none";
switch (MMRAND(1, 10)) {
- case 1: /* 10% */
+ case 1: case 2: case 3: /* 30% */
break;
- case 2: case 3: case 4: case 5: /* 40% */
+ case 4: case 5: /* 20% */
if (access(BZIP_PATH, R_OK) == 0)
cstr = "compression=bzip";
break;
case 6: /* 10% */
if (access(BZIP_PATH, R_OK) == 0)
- cstr = "compression=raw";
+ cstr = "compression=bzip-raw";
break;
- case 7: case 8: case 9: case 10: /* 40% */
+ case 7: case 8: /* 20% */
if (access(SNAPPY_PATH, R_OK) == 0)
cstr = "compression=snappy";
break;
+ case 9: case 10: /* 20% */
+ if (access(ZLIB_PATH, R_OK) == 0)
+ cstr = "compression=zlib";
+ break;
}
config_single(cstr, 0);
}
switch (g.c_compression_flag) {
case COMPRESS_BZIP:
- case COMPRESS_RAW:
+ case COMPRESS_BZIP_RAW:
if (access(BZIP_PATH, R_OK) != 0)
die(0, "bzip library not found or not readable");
break;
@@ -250,6 +258,11 @@ config_compression(void)
case COMPRESS_SNAPPY:
if (access(SNAPPY_PATH, R_OK) != 0)
die(0, "snappy library not found or not readable");
+ break;
+ case COMPRESS_ZLIB:
+ if (access(ZLIB_PATH, R_OK) != 0)
+ die(0, "zlib library not found or not readable");
+ break;
}
}
@@ -381,9 +394,9 @@ config_single(const char *s, int perm)
if (cp->flags & C_STRING) {
if (strncmp(s, "data_source", strlen("data_source")) == 0 &&
strncmp("file", ep, strlen("file")) != 0 &&
+ strncmp("helium", ep, strlen("helium")) != 0 &&
strncmp("kvsbdb", ep, strlen("kvsbdb")) != 0 &&
strncmp("lsm", ep, strlen("lsm")) != 0 &&
- strncmp("memrata", ep, strlen("memrata")) != 0 &&
strncmp("table", ep, strlen("table")) != 0) {
fprintf(stderr,
"Invalid data source option: %s\n", ep);
@@ -472,12 +485,14 @@ config_map_compression(const char *s, u_int *vp)
*vp = COMPRESS_NONE;
else if (strcmp(s, "bzip") == 0)
*vp = COMPRESS_BZIP;
+ else if (strcmp(s, "bzip-raw") == 0)
+ *vp = COMPRESS_BZIP_RAW;
else if (strcmp(s, "lzo") == 0)
*vp = COMPRESS_LZO;
- else if (strcmp(s, "raw") == 0)
- *vp = COMPRESS_RAW;
else if (strcmp(s, "snappy") == 0)
*vp = COMPRESS_SNAPPY;
+ else if (strcmp(s, "zlib") == 0)
+ *vp = COMPRESS_ZLIB;
else
die(EINVAL, "illegal compression configuration: %s", s);
}
@@ -533,3 +548,16 @@ config_file_type(u_int type)
}
return ("error: unknown file type");
}
+
+/*
+ * config_sanity --
+ * Once configuration is done, any remaining sanity checks.
+ */
+static void
+config_sanity(void)
+{
+ if (g.c_key_min > g.c_key_max)
+ die(EINVAL, "key_min may not be larger than key_max");
+ if (g.c_value_min > g.c_value_max)
+ die(EINVAL, "value_min may not be larger than value_max");
+}
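For reference, the reworked switch in config_compression() maps a uniform 1..10 draw onto fixed percentages and keeps "none" whenever the corresponding shared library is not readable. A stand-alone sketch of that selection pattern, with the random draw and library paths passed in rather than taken from the test's globals:

#include <unistd.h>

/* Sketch only: pick a compressor with fixed odds, falling back to "none"
 * when the required shared library is missing. */
static const char *
pick_compression(unsigned roll,			/* uniform in 1..10 */
    const char *bzip_path, const char *snappy_path, const char *zlib_path)
{
	const char *cstr = "compression=none";

	switch (roll) {
	case 1: case 2: case 3:				/* 30% */
		break;
	case 4: case 5:					/* 20% */
		if (access(bzip_path, R_OK) == 0)
			cstr = "compression=bzip";
		break;
	case 6:						/* 10% */
		if (access(bzip_path, R_OK) == 0)
			cstr = "compression=bzip-raw";
		break;
	case 7: case 8:					/* 20% */
		if (access(snappy_path, R_OK) == 0)
			cstr = "compression=snappy";
		break;
	case 9: case 10:				/* 20% */
		if (access(zlib_path, R_OK) == 0)
			cstr = "compression=zlib";
		break;
	}
	return (cstr);
}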
diff --git a/test/format/config.h b/test/format/config.h
index 66ec518d33d..bfd2e010fa3 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -116,7 +116,7 @@ static CONFIG c[] = {
0x0, C_BOOL, 10, 0, 0, &g.c_compact, NULL },
{ "compression",
- "type of compression (none | bzip | lzo | raw | snappy)",
+ "type of compression (none | bzip | bzip-raw | lzo | snappy | zlib)",
0x0, C_IGNORE|C_STRING, 1, 5, 5, NULL, &g.c_compression },
{ "data_extend",
@@ -124,7 +124,7 @@ static CONFIG c[] = {
0x0, C_BOOL, 5, 0, 0, &g.c_data_extend, NULL },
{ "data_source",
- "data source (file | kvsbdb | lsm | memrata | table)",
+ "data source (file | helium | kvsbdb | lsm | table)",
0x0, C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_data_source },
{ "delete_pct",
diff --git a/test/format/format.h b/test/format/format.h
index c9815a27c21..cc65526150a 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -57,17 +57,18 @@ extern WT_EXTENSION_API *wt_api;
EXTPATH "compressors/bzip2/.libs/libwiredtiger_bzip2.so"
#define SNAPPY_PATH \
EXTPATH "compressors/snappy/.libs/libwiredtiger_snappy.so"
+#define ZLIB_PATH \
+ EXTPATH "compressors/zlib/.libs/libwiredtiger_zlib.so"
#define REVERSE_PATH \
EXTPATH "collators/reverse/.libs/libwiredtiger_reverse_collator.so"
#define KVS_BDB_PATH \
EXTPATH "test/kvs_bdb/.libs/libwiredtiger_kvs_bdb.so"
-#define MEMRATA_PATH \
- EXTPATH "test/memrata/.libs/libwiredtiger_memrata.so"
+#define HELIUM_PATH \
+ EXTPATH "datasources/helium/.libs/libwiredtiger_helium.so"
#define LZO_PATH ".libs/lzo_compress.so"
-#define RAW_PATH ".libs/raw_compress.so"
#undef M
#define M(v) ((v) * 1000000) /* Million */
@@ -99,6 +100,8 @@ typedef struct {
char *home_stats; /* Statistics file path */
char *home_salvage_copy; /* Salvage copy command */
+ char *helium_mount; /* Helium volume */
+
void *bdb; /* BDB comparison handle */
void *dbc; /* BDB cursor handle */
@@ -193,9 +196,10 @@ typedef struct {
#define COMPRESS_NONE 1
#define COMPRESS_BZIP 2
-#define COMPRESS_LZO 3
-#define COMPRESS_RAW 4
+#define COMPRESS_BZIP_RAW 3
+#define COMPRESS_LZO 4
#define COMPRESS_SNAPPY 5
+#define COMPRESS_ZLIB 6
u_int c_compression_flag; /* Compression flag value */
uint64_t key_cnt; /* Keys loaded so far */
diff --git a/test/format/ops.c b/test/format/ops.c
index 4002b13e5aa..e3618394baa 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -253,8 +253,8 @@ ops(void *arg)
* LSM and data-sources don't support named checkpoints,
* else 25% of the time we name the checkpoint.
*/
- if (DATASOURCE("lsm") || DATASOURCE("kvsbdb") ||
- DATASOURCE("memrata") || MMRAND(1, 4) == 1)
+ if (DATASOURCE("lsm") || DATASOURCE("helium") ||
+ DATASOURCE("kvsbdb") || MMRAND(1, 4) == 1)
ckpt_config = NULL;
else {
(void)snprintf(config, sizeof(config),
diff --git a/test/format/s_dumpcmp.sh b/test/format/s_dumpcmp.sh
index d9c047d2e52..82a9a214a88 100755
--- a/test/format/s_dumpcmp.sh
+++ b/test/format/s_dumpcmp.sh
@@ -47,14 +47,18 @@ lzo_ext=".libs/lzo_compress.so"
if test -e $lzo_ext ; then
ext="$ext,\"$lzo_ext\""
fi
-raw_ext=".libs/raw_compress.so"
-if test -e $raw_ext ; then
- ext="$ext,\"$raw_ext\""
+bzip_raw_ext=".libs/bzip_raw_compress.so"
+if test -e $bzip_raw_ext ; then
+ ext="$ext,\"$bzip_raw_ext\""
fi
snappy_ext="$top/ext/compressors/snappy/.libs/libwiredtiger_snappy.so"
if test -e $snappy_ext ; then
ext="$ext,\"$snappy_ext\""
fi
+zlib_ext="$top/ext/compressors/zlib/.libs/libwiredtiger_zlib.so"
+if test -e $zlib_ext ; then
+ ext="$ext,\"$zlib_ext\""
+fi
config='extensions=['$ext']'
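The shell fragment above assembles the same "extensions=[...]" list that wiredtiger_open() accepts directly. A minimal C sketch of opening a connection with such a list (the library paths are placeholders for whatever .libs/ objects were actually built):

#include <wiredtiger.h>

/* Sketch only: open a connection with compressor extensions listed the same
 * way s_dumpcmp.sh builds its "extensions=[...]" string. */
static int
open_with_extensions(const char *home, WT_CONNECTION **connp)
{
	return (wiredtiger_open(home, NULL,
	    "create,extensions=["
	    "\"ext/compressors/snappy/.libs/libwiredtiger_snappy.so\","
	    "\"ext/compressors/zlib/.libs/libwiredtiger_zlib.so\"]",
	    connp));
}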
diff --git a/test/format/t.c b/test/format/t.c
index 83ed877632b..10f14744435 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -60,7 +60,7 @@ main(int argc, char *argv[])
/* Set values from the command line. */
home = NULL;
- while ((ch = getopt(argc, argv, "1C:c:h:Llqrt:")) != EOF)
+ while ((ch = getopt(argc, argv, "1C:c:H:h:Llqrt:")) != EOF)
switch (ch) {
case '1': /* One run */
g.c_runs = 1;
@@ -71,6 +71,9 @@ main(int argc, char *argv[])
case 'c': /* Configuration from a file */
config = optarg;
break;
+ case 'H':
+ g.helium_mount = optarg;
+ break;
case 'h':
home = optarg;
break;
@@ -374,13 +377,14 @@ usage(void)
{
fprintf(stderr,
"usage: %s [-1Llqr]\n "
- "[-C wiredtiger-config] [-c config-file] [-h home] "
+ "[-C wiredtiger-config] [-c config-file] [-H mount] [-h home] "
"[name=value ...]\n",
g.progname);
fprintf(stderr, "%s",
"\t-1 run once\n"
"\t-C specify wiredtiger_open configuration arguments\n"
"\t-c read test program configuration from a file\n"
+ "\t-H mount Helium volume mount point\n"
"\t-h home (default 'RUNDIR')\n"
"\t-L output to a log file\n"
"\t-l log operations (implies -L)\n"
diff --git a/test/format/wts.c b/test/format/wts.c
index f2f408be018..196c7470658 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -96,9 +96,8 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
g.c_reverse ? REVERSE_PATH : "",
access(BZIP_PATH, R_OK) == 0 ? BZIP_PATH : "",
access(LZO_PATH, R_OK) == 0 ? LZO_PATH : "",
- (access(RAW_PATH, R_OK) == 0 &&
- access(BZIP_PATH, R_OK) == 0) ? RAW_PATH : "",
access(SNAPPY_PATH, R_OK) == 0 ? SNAPPY_PATH : "",
+ access(ZLIB_PATH, R_OK) == 0 ? ZLIB_PATH : "",
DATASOURCE("kvsbdb") ? KVS_BDB_PATH : "",
g.c_config_open == NULL ? "" : g.c_config_open,
g.config_open == NULL ? "" : g.config_open);
@@ -120,18 +119,26 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
g.wt_api = conn->get_extension_api(conn);
/*
- * Load the Memrata shared library: it would be possible to do this as
+ * Load the Helium shared library: it would be possible to do this as
* part of the extensions configured for wiredtiger_open, there's no
* difference, I am doing it here because it's easier to work with the
* configuration strings.
*/
- if (DATASOURCE("memrata") &&
- (ret = conn->load_extension(conn, MEMRATA_PATH,
- "entry=wiredtiger_extension_init,config=["
- "dev1=[kvs_devices=[/dev/loop0,/dev/loop1],kvs_open_o_truncate=1],"
- "dev2=[kvs_devices=[/dev/loop2],kvs_open_o_truncate=1]]")) != 0)
- die(ret, "WT_CONNECTION.load_extension: %s", MEMRATA_PATH);
-
+ if (DATASOURCE("helium")) {
+ if (g.helium_mount == NULL)
+ die(EINVAL, "no Helium mount point specified");
+ (void)snprintf(config, sizeof(config),
+ "entry=wiredtiger_extension_init,config=["
+ "helium_verbose=0,"
+ "dev1=[helium_devices=\"he://./%s\","
+ "helium_o_volume_truncate=1]]",
+ g.helium_mount);
+ if ((ret =
+ conn->load_extension(conn, HELIUM_PATH, config)) != 0)
+ die(ret,
+ "WT_CONNECTION.load_extension: %s:%s",
+ HELIUM_PATH, config);
+ }
*connp = conn;
}
@@ -151,12 +158,6 @@ wts_create(void)
conn = g.wts_conn;
/*
- * Create the underlying store.
- */
- if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
- die(ret, "connection.open_session");
-
- /*
* Ensure that we can service at least one operation per-thread
* concurrently without filling the cache with pinned pages. We
* choose a multiplier of three because the max configurations control
@@ -244,21 +245,25 @@ wts_create(void)
p += snprintf(p, (size_t)(end - p),
",block_compressor=\"bzip2\"");
break;
- case COMPRESS_LZO:
+ case COMPRESS_BZIP_RAW:
p += snprintf(p, (size_t)(end - p),
- ",block_compressor=\"LZO1B-6\"");
+ ",block_compressor=\"bzip2-raw-test\"");
break;
- case COMPRESS_RAW:
+ case COMPRESS_LZO:
p += snprintf(p, (size_t)(end - p),
- ",block_compressor=\"raw\"");
+ ",block_compressor=\"LZO1B-6\"");
break;
case COMPRESS_SNAPPY:
p += snprintf(p, (size_t)(end - p),
",block_compressor=\"snappy\"");
break;
+ case COMPRESS_ZLIB:
+ p += snprintf(p, (size_t)(end - p),
+ ",block_compressor=\"zlib\"");
+ break;
}
- /* Configure internal key truncation. */
+ /* Configure Btree internal key truncation. */
p += snprintf(
p, (size_t)(end - p), ",internal_key_truncate=%s",
g.c_internal_key_truncation ? "true" : "false");
@@ -270,7 +275,12 @@ wts_create(void)
p += snprintf(p, (size_t)(end - p),
",split_pct=%" PRIu32, g.c_split_pct);
- /* Configure data types. */
+ /* Configure LSM and data-sources. */
+ if (DATASOURCE("helium"))
+ p += snprintf(p, (size_t)(end - p),
+ ",type=helium,helium_o_compress=%d,helium_o_truncate=1",
+ g.c_compression_flag == COMPRESS_NONE ? 0 : 1);
+
if (DATASOURCE("kvsbdb"))
p += snprintf(p, (size_t)(end - p), ",type=kvsbdb");
@@ -301,13 +311,13 @@ wts_create(void)
p += snprintf(p, (size_t)(end - p), ",)");
}
- if (DATASOURCE("memrata"))
- p += snprintf(p, (size_t)(end - p),
- ",type=memrata,kvs_open_o_truncate=1");
-
+ /*
+ * Create the underlying store.
+ */
+ if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
+ die(ret, "connection.open_session");
if ((ret = session->create(session, g.uri, config)) != 0)
die(ret, "session.create: %s", g.uri);
-
if ((ret = session->close(session, NULL)) != 0)
die(ret, "session.close");
}
@@ -331,8 +341,8 @@ wts_dump(const char *tag, int dump_bdb)
int ret;
char *cmd;
- /* Data-sources that don't support dump through the wt utility. */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
+ /* Some data-sources don't support dump through the wt utility. */
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
return;
track("dump files and compare", 0ULL, NULL);
@@ -361,10 +371,8 @@ wts_salvage(void)
WT_SESSION *session;
int ret;
- /*
- * Data-sources that don't support salvage.
- */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
+ /* Some data-sources don't support salvage. */
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
return;
conn = g.wts_conn;
@@ -392,12 +400,6 @@ wts_verify(const char *tag)
WT_SESSION *session;
int ret;
- /*
- * Data-sources that don't support verify.
- */
- if (DATASOURCE("memrata"))
- return;
-
conn = g.wts_conn;
track("verify", 0ULL, NULL);
@@ -435,14 +437,14 @@ wts_stats(void)
uint64_t v;
int ret;
- /* Data-sources that don't support statistics. */
- if (DATASOURCE("kvsbdb") || DATASOURCE("memrata"))
- return;
-
/* Ignore statistics if they're not configured. */
if (g.c_statistics == 0)
return;
+ /* Some data-sources don't support statistics. */
+ if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
+ return;
+
conn = g.wts_conn;
track("stat", 0ULL, NULL);
diff --git a/tools/wtperf_graph.py b/tools/wtperf_graph.py
index 91ea0ab3ffb..fb925ef28b9 100644
--- a/tools/wtperf_graph.py
+++ b/tools/wtperf_graph.py
@@ -26,29 +26,40 @@
# OTHER DEALINGS IN THE SOFTWARE.
#
-import csv, os
+import csv, os, sys
from subprocess import call
# Python script to read wtperf monitor output and create a performance
# graph.
TIMEFMT = "%b %d %H:%M:%S"
-# Read the monitor file and figure out when a checkpoint was running.
-in_ckpt = 'N'
-ckptlist=[]
-with open('monitor', 'r') as csvfile:
- reader = csv.reader(csvfile)
- for row in reader:
- if row[4] != in_ckpt:
- ckptlist.append(row[0])
- in_ckpt = row[4]
-if in_ckpt == 'Y':
- ckptlist.append(row[0])
+def process_monitor(fname, ckptlist, opdict):
+ # Read the monitor file and figure out when a checkpoint was running.
+ in_ckpt = 'N'
+ ckptlist=[]
+ # Monitor output format currently is:
+ # time,totalsec,read,insert,update,ckpt,...latencies...
+ ops = ('read', 'insert', 'update')
+ csvcol = (2, 3, 4)
+ with open(fname, 'r') as csvfile:
+ reader = csv.reader(csvfile)
+ for row in reader:
+ # Look for checkpoints and operations.
+ if row[5] != in_ckpt:
+ ckptlist.append(row[0])
+ in_ckpt = row[5]
+ for op, col in zip(ops, csvcol):
+ if row[col] != '0' and opdict[op] == 0:
+ opdict[op] = 1
-# Graph time vs. read, insert and update operations per second.
-of = open("gnuplot.cmd", "w")
-of.write('''
+ if in_ckpt == 'Y':
+ ckptlist.append(row[0])
+
+ # Graph time vs. read, insert and update operations per second.
+ gcmd = "gnuplot.mon.cmd"
+ of = open(gcmd, "w")
+ of.write('''
set autoscale
set datafile sep ','
set grid
@@ -60,26 +71,27 @@ set format x "%(TIMEFMT)s"
set xlabel "Time"
set xtics rotate by -45
set xdata time
-set ylabel "Operations per second (hundreds)"
+set ylabel "Operations per second (thousands)"
set yrange [0:]\n''' % {
'TIMEFMT' : TIMEFMT
- })
-it = iter(ckptlist)
-for start, stop in zip(it, it):
- of.write('set object rectangle from first \'' + start +\
- '\', graph 0 ' + ' to first \'' + stop +\
- '\', graph 1 fc rgb "gray" back\n')
-of.write('''
-set output 'monitor.png'
-plot "monitor" using 1:($2/100) title "Reads", "monitor" using 1:($3/100) title "Updates", "monitor" using 1:($4/100) title "Inserts"\n''')
-of.close()
-call(["gnuplot", "gnuplot.cmd"])
-os.remove("gnuplot.cmd")
-
+ })
+ it = iter(ckptlist)
+ for start, stop in zip(it, it):
+ of.write('set object rectangle from first \'' + start +\
+ '\', graph 0 ' + ' to first \'' + stop +\
+ '\', graph 1 fc rgb "gray" back\n')
+ of.write('set output "' + fname + '.png"\n')
+ of.write('plot "' + fname + '" using 1:($3/1000) title "Reads", "' +\
+ fname + '" using 1:($4/1000) title "Inserts", "' +\
+ fname + '" using 1:($5/1000) title "Updates"\n')
+ of.close()
+ call(["gnuplot", gcmd])
+ os.remove(gcmd)
# Graph time vs. average, minimum, maximum latency for an operation.
-def plot_latency_operation(name, col_avg, col_min, col_max):
- of = open("gnuplot.cmd", "w")
+def plot_latency_operation(name, fname, sfx, ckptlist, col_avg, col_min, col_max):
+ gcmd = "gnuplot." + name + ".l1.cmd"
+ of = open(gcmd, "w")
of.write('''
set autoscale
set datafile sep ','
@@ -103,48 +115,55 @@ set yrange [1:]\n''' % {
of.write('set object rectangle from first \'' + start +\
'\', graph 0 ' + ' to first \'' + stop +\
'\', graph 1 fc rgb "gray" back\n')
- of.write('''
-set output '%(NAME)s.latency1.png'
-plot "monitor" using 1:($%(COL_AVG)d / 1000) title "Average Latency", "monitor" using 1:($%(COL_MIN)d / 1000) title "Minimum Latency", "monitor" using 1:($%(COL_MAX)d / 1000) title "Maximum Latency"\n''' % {
- 'NAME' : name,
- 'COL_AVG' : col_avg,
- 'COL_MIN' : col_min,
- 'COL_MAX' : col_max
- })
+ ofname = name + sfx + '.latency1.png'
+ of.write('set output "' + ofname + '"\n')
+ of.write('plot "' + fname + '" using 1:($' + repr(col_avg) +\
+ ') title "Average Latency", "' + fname +'" using 1:($' +\
+ repr(col_min) + ') title "Minimum Latency", "' +\
+ fname + '" using 1:($' + repr(col_max) + ') title "Maximum Latency"\n')
of.close()
- call(["gnuplot", "gnuplot.cmd"])
- os.remove("gnuplot.cmd")
+ call(["gnuplot", gcmd])
+ os.remove(gcmd)
# Graph latency vs. % operations
-def plot_latency_percent(name):
- of = open("gnuplot.cmd", "w")
+def plot_latency_percent(name, dirname, sfx, ckptlist):
+ lfile = os.path.join(dirname, 'latency.' + name)
+ if not os.path.exists(lfile):
+ return
+ gcmd = "gnuplot." + name + ".l2.cmd"
+ of = open(gcmd, "w")
of.write('''
set autoscale
set datafile sep ','
set grid
set style data points
-set terminal png nocrop size 800,600
-set title "%(NAME)s: latency distribution"
+set terminal png nocrop size 800,600\n''')
+ of.write('set title "' + name + ': latency distribution"\n')
+ of.write('''
set xlabel "Latency (us)"
set xrange [1:]
set xtics rotate by -45
set logscale x
set ylabel "%% operations"
-set yrange [0:]
-set output '%(NAME)s.latency2.png'
-plot "latency.%(NAME)s" using (($2 * 100)/$4) title "%(NAME)s"\n''' % {
- 'NAME' : name
- })
+set yrange [0:]\n''')
+ ofname = name + sfx + '.latency2.png'
+ of.write('set output "' + ofname + '"\n')
+ of.write('plot "' + lfile + sfx +\
+ '" using (($2 * 100)/$4) title "' + name + '"\n')
of.close()
- call(["gnuplot", "gnuplot.cmd"])
- os.remove("gnuplot.cmd")
+ call(["gnuplot", gcmd])
+ os.remove(gcmd)
# Graph latency vs. % operations (cumulative)
-def plot_latency_cumulative_percent(name):
+def plot_latency_cumulative_percent(name, dirname, sfx, ckptlist):
+ lfile = os.path.join(dirname, 'latency.' + name)
+ if not os.path.exists(lfile):
+ return
# Latency plot: cumulative operations vs. latency
- of = open("gnuplot.cmd", "w")
+ gcmd = "gnuplot." + name + ".l3.cmd"
+ of = open(gcmd, "w")
of.write('''
set autoscale
set datafile sep ','
@@ -157,19 +176,52 @@ set xrange [1:]
set xtics rotate by -45
set logscale x
set ylabel "%% operations"
-set yrange [0:]
-set output '%(NAME)s.latency3.png'
-plot "latency.%(NAME)s" using 1:(($3 * 100)/$4) title "%(NAME)s"\n''' % {
+set yrange [0:]\n''' % {
'NAME' : name
})
+ ofname = name + sfx + '.latency3.png'
+ of.write('set output "' + ofname + '"\n')
+ of.write('plot "' + lfile + sfx +\
+ '" using 1:(($3 * 100)/$4) title "' + name + '"\n')
of.close()
- call(["gnuplot", "gnuplot.cmd"])
- os.remove("gnuplot.cmd")
+ call(["gnuplot", gcmd])
+ os.remove(gcmd)
+
+def process_file(fname):
+ ckptlist = []
+ # NOTE: The operations below must be in this exact order to match
+ # the operation latency output in the monitor file.
+ opdict={'read':0, 'insert':0, 'update':0}
+ process_monitor(fname, ckptlist, opdict)
+
+ # This assumes the monitor file has the string "monitor"
+ # and any other (optional) characters in the filename are a suffix.
+ sfx = os.path.basename(fname).replace('monitor','')
+ dirname = os.path.dirname(fname)
+
+ column = 7 # average, minimum, maximum start in column 7
+ for k, v in opdict.items():
+ if v != 0:
+ plot_latency_operation(
+ k, fname, sfx, ckptlist, column, column + 1, column + 2)
+ plot_latency_percent(k, dirname, sfx, ckptlist)
+ plot_latency_cumulative_percent(k, dirname, sfx, ckptlist)
+ else:
+ print fname + ': no ' + k + ' operations found. Skip.'
+ column = column + 3
+def main():
+ # This program takes a list of monitor files generated by
+ # wtperf. If no args are given, it looks for a single file
+ # named 'monitor'.
+ numargs = len(sys.argv)
+ if numargs < 2:
+ process_file('monitor')
+ else:
+ d = 1
+ while d < numargs:
+ process_file(sys.argv[d])
+ d += 1
-column = 6 # average, minimum, maximum start in column 6
-for op in ['read', 'insert', 'update']:
- plot_latency_operation(op, column, column + 1, column + 2)
- column = column + 3
- plot_latency_percent(op)
- plot_latency_cumulative_percent(op)
+if __name__ == '__main__':
+ main()
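As restructured, the script takes one or more monitor files on the command line, deriving an output-file suffix from whatever follows "monitor" in each name, and falls back to a single file named monitor when run with no arguments. A hypothetical invocation (file names invented) might be:

	python wtperf_graph.py monitor monitor.lsm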