summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--SConstruct5
-rw-r--r--bench/wtperf/runners/evict-btree-1.wtperf11
-rw-r--r--bench/wtperf/runners/evict-lsm-1.wtperf12
-rw-r--r--build_posix/aclocal/types.m41
-rw-r--r--build_win/filelist.win7
-rw-r--r--dist/api_data.py17
-rw-r--r--dist/api_err.py8
-rw-r--r--dist/filelist11
-rw-r--r--dist/flags.py7
-rwxr-xr-xdist/s_docs3
-rwxr-xr-xdist/s_longlines4
-rwxr-xr-xdist/s_prototypes2
-rw-r--r--dist/s_string.ok16
-rwxr-xr-xdist/s_style2
-rwxr-xr-xdist/s_whitespace2
-rwxr-xr-xdist/s_win4
-rw-r--r--dist/stat_data.py7
-rw-r--r--examples/c/Makefile.am3
-rw-r--r--examples/c/ex_all.c7
-rw-r--r--examples/c/ex_config.c91
-rw-r--r--examples/c/ex_event_handler.c4
-rw-r--r--examples/c/ex_file_system.c721
-rw-r--r--examples/c/ex_schema.c76
-rw-r--r--examples/c/ex_scope.c6
-rw-r--r--examples/c/queue_example.h149
-rw-r--r--examples/java/com/wiredtiger/examples/ex_all.java12
-rw-r--r--examples/java/com/wiredtiger/examples/ex_schema.java76
-rw-r--r--ext/compressors/zlib/zlib_compress.c10
-rw-r--r--lang/java/java_doc.i1
-rw-r--r--lang/java/src/com/wiredtiger/db/PackFormatInputStream.java7
-rw-r--r--lang/java/src/com/wiredtiger/db/PackInputStream.java35
-rw-r--r--lang/java/src/com/wiredtiger/db/PackOutputStream.java2
-rw-r--r--lang/java/wiredtiger.i65
-rw-r--r--src/block/block_ckpt.c11
-rw-r--r--src/block/block_ext.c5
-rw-r--r--src/block/block_map.c48
-rw-r--r--src/block/block_mgr.c33
-rw-r--r--src/block/block_open.c40
-rw-r--r--src/block/block_read.c61
-rw-r--r--src/block/block_vrfy.c20
-rw-r--r--src/block/block_write.c53
-rw-r--r--src/bloom/bloom.c2
-rw-r--r--src/btree/bt_curnext.c15
-rw-r--r--src/btree/bt_curprev.c18
-rw-r--r--src/btree/bt_cursor.c42
-rw-r--r--src/btree/bt_debug.c96
-rw-r--r--src/btree/bt_delete.c7
-rw-r--r--src/btree/bt_discard.c53
-rw-r--r--src/btree/bt_handle.c21
-rw-r--r--src/btree/bt_huffman.c25
-rw-r--r--src/btree/bt_page.c23
-rw-r--r--src/btree/bt_read.c8
-rw-r--r--src/btree/bt_rebalance.c5
-rw-r--r--src/btree/bt_ret.c2
-rw-r--r--src/btree/bt_slvg.c45
-rw-r--r--src/btree/bt_split.c152
-rw-r--r--src/btree/bt_sync.c4
-rw-r--r--src/btree/bt_vrfy.c12
-rw-r--r--src/btree/col_modify.c20
-rw-r--r--src/btree/col_srch.c30
-rw-r--r--src/btree/row_key.c6
-rw-r--r--src/btree/row_modify.c13
-rw-r--r--src/btree/row_srch.c13
-rw-r--r--src/checksum/checksum.c (renamed from src/support/cksum.c)0
-rw-r--r--src/checksum/power8/LICENSE.TXT (renamed from src/support/power8/LICENSE.TXT)0
-rw-r--r--src/checksum/power8/README.md (renamed from src/support/power8/README.md)0
-rw-r--r--src/checksum/power8/crc32.S (renamed from src/support/power8/crc32.S)0
-rw-r--r--src/checksum/power8/crc32_constants.h (renamed from src/support/power8/crc32_constants.h)0
-rw-r--r--src/checksum/power8/crc32_wrapper.c (renamed from src/support/power8/crc32_wrapper.c)0
-rw-r--r--src/checksum/power8/ppc-opcode.h (renamed from src/support/power8/ppc-opcode.h)0
-rw-r--r--src/config/config_def.c16
-rw-r--r--src/conn/conn_api.c204
-rw-r--r--src/conn/conn_cache.c20
-rw-r--r--src/conn/conn_cache_pool.c1
-rw-r--r--src/conn/conn_handle.c8
-rw-r--r--src/conn/conn_log.c42
-rw-r--r--src/conn/conn_open.c2
-rw-r--r--src/conn/conn_stat.c32
-rw-r--r--src/cursor/cur_backup.c91
-rw-r--r--src/cursor/cur_ds.c2
-rw-r--r--src/cursor/cur_index.c16
-rw-r--r--src/cursor/cur_join.c1498
-rw-r--r--src/cursor/cur_std.c1
-rw-r--r--src/docs/Doxyfile2
-rw-r--r--src/docs/backup.dox44
-rw-r--r--src/docs/cursor-join.dox25
-rw-r--r--src/docs/custom-file-systems.dox25
-rw-r--r--src/docs/error-handling.dox3
-rw-r--r--src/docs/examples.dox6
-rw-r--r--src/docs/in-memory.dox12
-rw-r--r--src/docs/programming.dox2
-rw-r--r--src/docs/spell.ok5
-rw-r--r--src/docs/tune-cache.dox4
-rw-r--r--src/evict/evict_file.c4
-rw-r--r--src/evict/evict_lru.c355
-rw-r--r--src/evict/evict_page.c2
-rw-r--r--src/include/api.h2
-rw-r--r--src/include/bitstring.i14
-rw-r--r--src/include/block.h9
-rw-r--r--src/include/btmem.h83
-rw-r--r--src/include/btree.i22
-rw-r--r--src/include/cache.h26
-rw-r--r--src/include/column.i24
-rw-r--r--src/include/config.h71
-rw-r--r--src/include/connection.h44
-rw-r--r--src/include/cursor.h56
-rw-r--r--src/include/cursor.i2
-rw-r--r--src/include/extern.h61
-rw-r--r--src/include/flags.h5
-rw-r--r--src/include/log.h6
-rw-r--r--src/include/meta.h2
-rw-r--r--src/include/misc.h5
-rw-r--r--src/include/misc.i245
-rw-r--r--src/include/os.h128
-rw-r--r--src/include/os_fhandle.i154
-rw-r--r--src/include/os_fs.i243
-rw-r--r--src/include/os_fstream.i97
-rw-r--r--src/include/serial.i2
-rw-r--r--src/include/stat.h7
-rw-r--r--src/include/txn.h9
-rw-r--r--src/include/txn.i12
-rw-r--r--src/include/wiredtiger.in895
-rw-r--r--src/include/wt_internal.h26
-rw-r--r--src/log/log.c121
-rw-r--r--src/lsm/lsm_cursor.c4
-rw-r--r--src/lsm/lsm_merge.c28
-rw-r--r--src/lsm/lsm_meta.c2
-rw-r--r--src/lsm/lsm_tree.c14
-rw-r--r--src/lsm/lsm_work_unit.c4
-rw-r--r--src/meta/meta_track.c4
-rw-r--r--src/meta/meta_turtle.c74
-rw-r--r--src/os_common/filename.c44
-rw-r--r--src/os_common/os_fhandle.c162
-rw-r--r--src/os_common/os_fs_inmemory.c685
-rw-r--r--src/os_common/os_fs_stdio.c239
-rw-r--r--src/os_common/os_fstream.c213
-rw-r--r--src/os_common/os_fstream_stdio.c84
-rw-r--r--src/os_common/os_getline.c51
-rw-r--r--src/os_common/os_init.c41
-rw-r--r--src/os_posix/os_dir.c89
-rw-r--r--src/os_posix/os_dlopen.c2
-rw-r--r--src/os_posix/os_fallocate.c160
-rw-r--r--src/os_posix/os_fs.c616
-rw-r--r--src/os_posix/os_map.c118
-rw-r--r--src/os_win/os_dir.c95
-rw-r--r--src/os_win/os_dlopen.c1
-rw-r--r--src/os_win/os_fs.c581
-rw-r--r--src/os_win/os_map.c91
-rw-r--r--src/os_win/os_thread.c2
-rw-r--r--src/reconcile/rec_write.c143
-rw-r--r--src/schema/schema_create.c2
-rw-r--r--src/schema/schema_open.c6
-rw-r--r--src/schema/schema_rename.c4
-rw-r--r--src/schema/schema_stat.c8
-rw-r--r--src/session/session_api.c48
-rw-r--r--src/support/err.c4
-rw-r--r--src/support/scratch.c2
-rw-r--r--src/support/stat.c29
-rw-r--r--src/txn/txn.c245
-rw-r--r--src/txn/txn_ckpt.c7
-rw-r--r--src/txn/txn_nsnap.c2
-rw-r--r--src/utilities/util_backup.c2
-rw-r--r--src/utilities/util_dump.c362
-rw-r--r--src/utilities/util_main.c1
-rw-r--r--test/bloom/Makefile.am2
-rw-r--r--test/bloom/test_bloom.c12
-rw-r--r--test/checkpoint/test_checkpoint.c3
-rw-r--r--test/checkpoint/test_checkpoint.h12
-rw-r--r--test/cursor_order/Makefile.am2
-rw-r--r--test/cursor_order/cursor_order_ops.c19
-rw-r--r--test/fops/fops.c6
-rw-r--r--test/fops/thread.h18
-rw-r--r--test/format/backup.c118
-rw-r--r--test/format/bdb.c35
-rw-r--r--test/format/bulk.c51
-rw-r--r--test/format/compact.c2
-rw-r--r--test/format/config.c182
-rw-r--r--test/format/config.h4
-rw-r--r--test/format/format.h50
-rw-r--r--test/format/lrt.c24
-rw-r--r--test/format/ops.c736
-rw-r--r--test/format/salvage.c8
-rw-r--r--test/format/t.c9
-rw-r--r--test/format/util.c223
-rw-r--r--test/format/wts.c26
-rw-r--r--test/huge/Makefile.am2
-rw-r--r--test/huge/huge.c11
-rw-r--r--test/manydbs/Makefile.am2
-rw-r--r--test/manydbs/manydbs.c32
-rw-r--r--test/packing/Makefile.am3
-rw-r--r--test/packing/intpack-test.c9
-rw-r--r--test/packing/intpack-test2.c9
-rw-r--r--test/packing/intpack-test3.c14
-rw-r--r--test/packing/packing-test.c12
-rw-r--r--test/readonly/Makefile.am2
-rw-r--r--test/readonly/readonly.c14
-rw-r--r--test/recovery/Makefile.am2
-rw-r--r--test/salvage/Makefile.am2
-rw-r--r--test/salvage/salvage.c4
-rw-r--r--test/suite/test_backup05.py35
-rw-r--r--test/suite/test_join01.py409
-rw-r--r--test/suite/test_join07.py548
-rw-r--r--test/suite/test_join08.py265
-rw-r--r--test/suite/test_reconfig02.py10
-rw-r--r--test/suite/test_stat05.py5
-rw-r--r--test/suite/test_txn04.py2
-rw-r--r--test/thread/Makefile.am2
-rw-r--r--test/thread/rw.c12
-rw-r--r--test/thread/thread.h12
-rw-r--r--test/utility/test_util.i63
210 files changed, 8879 insertions, 5473 deletions
diff --git a/SConstruct b/SConstruct
index a7306262f82..425a531fda2 100644
--- a/SConstruct
+++ b/SConstruct
@@ -214,6 +214,7 @@ if (VERSION_MAJOR == None or
wiredtiger_includes = """
#include <sys/types.h>
#include <stdarg.h>
+ #include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
"""
@@ -345,12 +346,12 @@ examples = [
"ex_all",
"ex_async",
"ex_call_center",
- "ex_config",
"ex_config_parse",
"ex_cursor",
"ex_data_source",
"ex_encrypt",
"ex_extending",
+ "ex_file_system",
"ex_hello",
"ex_log",
"ex_pack",
@@ -468,7 +469,7 @@ Default(t)
#Build the Examples
for ex in examples:
- if(ex in ['ex_all', 'ex_async', 'ex_thread', 'ex_encrypt']):
+ if(ex in ['ex_all', 'ex_async', 'ex_encrypt', 'ex_file_system' , 'ex_thread']):
exp = env.Program(ex, "examples/c/" + ex + ".c", LIBS=[wtlib, shim] + wtlibs)
Default(exp)
env.Alias("check", env.SmokeTest(exp))
diff --git a/bench/wtperf/runners/evict-btree-1.wtperf b/bench/wtperf/runners/evict-btree-1.wtperf
new file mode 100644
index 00000000000..24da4dd7902
--- /dev/null
+++ b/bench/wtperf/runners/evict-btree-1.wtperf
@@ -0,0 +1,11 @@
+# wtperf options file: evict btree configuration
+conn_config="cache_size=50M"
+table_config="type=file"
+icount=10000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
+# Add throughput/latency monitoring
+max_latency=2000
+sample_interval=5
diff --git a/bench/wtperf/runners/evict-lsm-1.wtperf b/bench/wtperf/runners/evict-lsm-1.wtperf
new file mode 100644
index 00000000000..ad885d98eb7
--- /dev/null
+++ b/bench/wtperf/runners/evict-lsm-1.wtperf
@@ -0,0 +1,12 @@
+# wtperf options file: evict lsm configuration
+conn_config="cache_size=50M,lsm_manager=(worker_thread_max=6)"
+table_config="type=lsm,lsm=(chunk_size=2M),os_cache_dirty_max=16MB"
+compact=true
+icount=10000000
+report_interval=5
+run_time=120
+populate_threads=1
+threads=((count=16,reads=1))
+# Add throughput/latency monitoring
+max_latency=2000
+sample_interval=5
diff --git a/build_posix/aclocal/types.m4 b/build_posix/aclocal/types.m4
index 439034c89d2..089058f5611 100644
--- a/build_posix/aclocal/types.m4
+++ b/build_posix/aclocal/types.m4
@@ -7,6 +7,7 @@ AC_DEFUN([AM_TYPES], [
#include <sys/types.h>
#include <inttypes.h>
#include <stdarg.h>
+#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>"
AC_SUBST(wiredtiger_includes_decl)
diff --git a/build_win/filelist.win b/build_win/filelist.win
index c370303d5f8..d52a57ba2e3 100644
--- a/build_win/filelist.win
+++ b/build_win/filelist.win
@@ -47,6 +47,7 @@ src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
src/cache/cache_las.c
+src/checksum/checksum.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
@@ -106,10 +107,9 @@ src/os_common/os_abort.c
src/os_common/os_alloc.c
src/os_common/os_fhandle.c
src/os_common/os_fs_inmemory.c
-src/os_common/os_fs_stdio.c
-src/os_common/os_getline.c
+src/os_common/os_fstream.c
+src/os_common/os_fstream_stdio.c
src/os_common/os_getopt.c
-src/os_common/os_init.c
src/os_common/os_strtouq.c
src/os_win/os_dir.c
src/os_win/os_dlopen.c
@@ -149,7 +149,6 @@ src/session/session_api.c
src/session/session_compact.c
src/session/session_dhandle.c
src/session/session_salvage.c
-src/support/cksum.c
src/support/cond_auto.c
src/support/crypto.c
src/support/err.c
diff --git a/dist/api_data.py b/dist/api_data.py
index 5ca294a5d60..20c4433efc8 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -722,8 +722,8 @@ wiredtiger_open = wiredtiger_open_common + [
\c create option''',
type='boolean'),
Config('in_memory', 'false', r'''
- keep data in-memory only, minimize disk I/O''',
- type='boolean', undoc=True),
+ keep data in-memory only. See @ref in_memory for more information''',
+ type='boolean'),
Config('use_environment', 'true', r'''
use the \c WIREDTIGER_CONFIG and \c WIREDTIGER_HOME environment
variables if the process is not running with special privileges.
@@ -818,6 +818,13 @@ methods = {
Config('bloom_hash_count', '8', r'''
the number of hash values per item for the bloom filter''',
min='2', max='100'),
+ Config('operation', '"and"', r'''
+ the operation applied between this and other joined cursors.
+ When "operation=and" is specified, all the conditions implied by
+ joins must be satisfied for an entry to be returned by the join cursor;
+ when "operation=or" is specified, only one must be satisfied.
+ All cursors joined to a join cursor must have matching operations''',
+ choices=['and', 'or']),
Config('strategy', '', r'''
when set to bloom, a bloom filter is created and populated for
this index. This has an up front cost but may reduce the number
@@ -1073,11 +1080,17 @@ methods = {
type='boolean'),
]),
'WT_CONNECTION.reconfigure' : Method(connection_runtime_config),
+'WT_CONNECTION.set_file_system' : Method([]),
'WT_CONNECTION.load_extension' : Method([
Config('config', '', r'''
configuration string passed to the entry point of the
extension as its WT_CONFIG_ARG argument'''),
+ Config('early_load', 'false', r'''
+ whether this extension should be loaded at the beginning of
+ ::wiredtiger_open. Only applicable to extensions loaded via the
+ wiredtiger_open configuration string''',
+ type='boolean'),
Config('entry', 'wiredtiger_extension_init', r'''
the entry point of the extension, called to initialize the
extension when it is loaded. The signature of the function
diff --git a/dist/api_err.py b/dist/api_err.py
index a17c68ee196..af5f1c12b37 100644
--- a/dist/api_err.py
+++ b/dist/api_err.py
@@ -53,9 +53,11 @@ errors = [
to return an error if recovery is required to use the database.'''),
Error('WT_CACHE_FULL', -31807,
'operation would overflow cache', '''
- This error is generated when wiredtiger_open is configured
- to run in-memory, and an insert or update operation requires more
- than the configured cache size to complete.''', undoc=True),
+ This error is only generated when wiredtiger_open is configured
+ to run in-memory, and an insert or update operation requires
+ more than the configured cache size to complete. The operation
+ may be retried; if a transaction is in progress, it should be
+ rolled back and the operation retried in a new transaction.'''),
Error('WT_PERM_DENIED', -31808,
'permission denied (internal)', undoc=True),
]
diff --git a/dist/filelist b/dist/filelist
index 1d7ffa76922..22d29d22edf 100644
--- a/dist/filelist
+++ b/dist/filelist
@@ -47,6 +47,9 @@ src/btree/row_key.c
src/btree/row_modify.c
src/btree/row_srch.c
src/cache/cache_las.c
+src/checksum/checksum.c
+src/checksum/power8/crc32.S
+src/checksum/power8/crc32_wrapper.c
src/config/config.c
src/config/config_api.c
src/config/config_check.c
@@ -106,10 +109,9 @@ src/os_common/os_abort.c
src/os_common/os_alloc.c
src/os_common/os_fhandle.c
src/os_common/os_fs_inmemory.c
-src/os_common/os_fs_stdio.c
-src/os_common/os_getline.c
+src/os_common/os_fstream.c
+src/os_common/os_fstream_stdio.c
src/os_common/os_getopt.c
-src/os_common/os_init.c
src/os_common/os_strtouq.c
src/os_posix/os_dir.c
src/os_posix/os_dlopen.c
@@ -148,7 +150,6 @@ src/session/session_api.c
src/session/session_compact.c
src/session/session_dhandle.c
src/session/session_salvage.c
-src/support/cksum.c
src/support/cond_auto.c
src/support/crypto.c
src/support/err.c
@@ -160,8 +161,6 @@ src/support/hex.c
src/support/huffman.c
src/support/mtx_rw.c
src/support/pow.c
-src/support/power8/crc32.S
-src/support/power8/crc32_wrapper.c
src/support/rand.c
src/support/scratch.c
src/support/stat.c
diff --git a/dist/flags.py b/dist/flags.py
index 8f7827ad160..48b0a2452a9 100644
--- a/dist/flags.py
+++ b/dist/flags.py
@@ -8,13 +8,6 @@ flags = {
###################################################
# Internal routine flag declarations
###################################################
- 'file_types' : [
- 'FILE_TYPE_CHECKPOINT',
- 'FILE_TYPE_DATA',
- 'FILE_TYPE_DIRECTORY',
- 'FILE_TYPE_LOG',
- 'FILE_TYPE_REGULAR',
- ],
'log_scan' : [
'LOGSCAN_FIRST',
'LOGSCAN_FROM_CKP',
diff --git a/dist/s_docs b/dist/s_docs
index c66bcb0bd06..08602989fe8 100755
--- a/dist/s_docs
+++ b/dist/s_docs
@@ -114,7 +114,8 @@ valid_build()
}
classf=`ls ../docs/struct___* 2>/dev/null`
for c in $classf; do
- echo "$c: Need to add class to PREDEFINED in src/docs/Doxyfile"
+ echo "$c: Add class to PREDEFINED in src/docs/Doxyfile, then remove docs/*.{html,js} and rebuild"
+
done
}
diff --git a/dist/s_longlines b/dist/s_longlines
index 000f33d51d5..2837d400f9e 100755
--- a/dist/s_longlines
+++ b/dist/s_longlines
@@ -8,9 +8,9 @@ l=`(cd .. &&
find bench/wtperf examples ext src test -name '*.[chisy]' &&
find dist -name '*.py' &&
find src -name '*.in') |
- sed -e '/dist\/stat_data\.py/d' \
+ sed -e '/checksum\/power8/d' \
+ -e '/dist\/stat_data\.py/d' \
-e '/include\/extern\.h/d' \
- -e '/support\/power8/d' \
-e '/support\/stat\.c/d'`
for f in $l ; do
diff --git a/dist/s_prototypes b/dist/s_prototypes
index 4ceb69f4c77..aa66d06dbe0 100755
--- a/dist/s_prototypes
+++ b/dist/s_prototypes
@@ -55,7 +55,7 @@ l=`echo ../src\/os*/*.c`
for i in $l; do
proto $i
-done | tee xxx | env LC_ALL=C sort -u
+done | env LC_ALL=C sort -u
) > $t
f=../src/include/extern.h
diff --git a/dist/s_string.ok b/dist/s_string.ok
index eed034abb47..81d09a55225 100644
--- a/dist/s_string.ok
+++ b/dist/s_string.ok
@@ -76,6 +76,7 @@ DECL
DECR
DESC
DHANDLE
+DIRECTIO
DNE
DOI
DONTNEED
@@ -255,6 +256,7 @@ Qsort
RCS
RDNOLOCK
RDONLY
+READONLY
RECNO
REF's
REFs
@@ -487,9 +489,11 @@ conn
connectionp
const
constantp
+cookiep
copydoc
copyin
copyout
+countp
cp
cpuid
crc
@@ -601,6 +605,7 @@ evictserver
exactp
exe
execop
+existp
extern
extlist
fadvise
@@ -617,6 +622,7 @@ ffs
fgetc
fgetln
fh
+fhandle
filefrag
filehandle
fileid
@@ -639,8 +645,10 @@ fopen
fp
fprintf
free'd
+fs
fscanf
fstat
+fstream
fsync
fsyncLock
fsyncs
@@ -668,11 +676,13 @@ gostruct
goutf
gt
handleops
+handlep
hashval
havesize
hdr
highjack
hotbackup
+hselasky
html
huffman
hval
@@ -688,6 +698,7 @@ im
impl
incase
incr
+incrementals
incrementing
indices
indirects
@@ -696,6 +707,7 @@ infeasible
inflateInit
infmt
init
+initializers
initn
initsize
initval
@@ -716,6 +728,7 @@ io
ip
islocked
ispo
+isrc
iter
iteratively
jnr
@@ -738,6 +751,7 @@ lbracket
ld
le
len
+lengthp
lenp
level's
leveldb
@@ -843,6 +857,7 @@ noraw
notfound
notsup
notused
+nowait
nset
nsnap
nul
@@ -938,6 +953,7 @@ recsize
rectype
recurse
refp
+regionp
reinitialization
relocked
resize
diff --git a/dist/s_style b/dist/s_style
index a163eb83b25..85220124971 100755
--- a/dist/s_style
+++ b/dist/s_style
@@ -20,7 +20,7 @@ if [ $# -ne 1 ]; then
-name '*.[chisy]' -o -name '*.in' -o -name '*.dox' |
sed -e '/Makefile.in/d' \
-e '/build_win\/wiredtiger_config.h/d' \
- -e '/support\/power8/d' |
+ -e '/checksum\/power8/d' |
xargs $xp -n 1 -I{} sh ./dist/s_style {}
else
# General style correction and cleanup for a single file
diff --git a/dist/s_whitespace b/dist/s_whitespace
index 74820a4f0e9..8cf3f7dfe6f 100755
--- a/dist/s_whitespace
+++ b/dist/s_whitespace
@@ -38,7 +38,7 @@ for f in `find bench examples ext src test \
-name '*.in' -o \
-name 'Makefile.am' |
sed -e '/Makefile.in/d' \
- -e '/support\/power8/d'`; do
+ -e '/checksum\/power8/d'`; do
whitespace_and_empty_line $f
done
diff --git a/dist/s_win b/dist/s_win
index 562e89f94c6..7fe525c202d 100755
--- a/dist/s_win
+++ b/dist/s_win
@@ -47,8 +47,8 @@ win_filelist()
(
sed \
-e '/\/os_posix\//d' \
- -e '/src\/support\/power8\/crc32.S/d' \
- -e '/src\/support\/power8\/crc32_wrapper.c/d'
+ -e '/src\/checksum\/power8\/crc32.S/d' \
+ -e '/src\/checksum\/power8\/crc32_wrapper.c/d'
echo 'src/os_win/os_dir.c'
echo 'src/os_win/os_dlopen.c'
diff --git a/dist/stat_data.py b/dist/stat_data.py
index bd951e64999..0486d94e278 100644
--- a/dist/stat_data.py
+++ b/dist/stat_data.py
@@ -170,6 +170,9 @@ connection_stats = [
CacheStat('cache_eviction_force', 'pages evicted because they exceeded the in-memory maximum'),
CacheStat('cache_eviction_force_delete', 'pages evicted because they had chains of deleted items'),
CacheStat('cache_eviction_force_fail', 'failed eviction of pages that exceeded the in-memory maximum'),
+ CacheStat('cache_eviction_get_ref', 'eviction calls to get a page'),
+ CacheStat('cache_eviction_get_ref_empty', 'eviction calls to get a page found queue empty'),
+ CacheStat('cache_eviction_get_ref_empty2', 'eviction calls to get a page found queue empty after locking'),
CacheStat('cache_eviction_hazard', 'hazard pointer blocked page eviction'),
CacheStat('cache_eviction_internal', 'internal pages evicted'),
CacheStat('cache_eviction_maximum_page_size', 'maximum page size at eviction', 'no_clear,no_scale,size'),
@@ -177,6 +180,8 @@ connection_stats = [
CacheStat('cache_eviction_queue_not_empty', 'eviction server candidate queue not empty when topping up'),
CacheStat('cache_eviction_server_evicting', 'eviction server evicting pages'),
CacheStat('cache_eviction_server_not_evicting', 'eviction server populating queue, but not evicting pages'),
+ CacheStat('cache_eviction_server_slept', 'eviction server slept, because we did not make progress with eviction'),
+ CacheStat('cache_eviction_server_toobig', 'eviction server skipped very large page'),
CacheStat('cache_eviction_slow', 'eviction server unable to reach eviction goal'),
CacheStat('cache_eviction_split_internal', 'internal pages split during eviction'),
CacheStat('cache_eviction_split_leaf', 'leaf pages split during eviction'),
@@ -189,6 +194,7 @@ connection_stats = [
CacheStat('cache_overhead', 'percentage overhead', 'no_clear,no_scale'),
CacheStat('cache_pages_dirty', 'tracked dirty pages in the cache', 'no_clear,no_scale'),
CacheStat('cache_pages_inuse', 'pages currently held in the cache', 'no_clear,no_scale'),
+ CacheStat('cache_pages_requested', 'pages requested from the cache'),
CacheStat('cache_read', 'pages read into cache'),
CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
CacheStat('cache_write', 'pages written from cache'),
@@ -422,6 +428,7 @@ dsrc_stats = [
CacheStat('cache_inmem_split', 'in-memory page splits'),
CacheStat('cache_inmem_splittable', 'in-memory page passed criteria to be split'),
CacheStat('cache_overflow_value', 'overflow values cached in memory', 'no_scale'),
+ CacheStat('cache_pages_requested', 'pages requested from the cache'),
CacheStat('cache_read', 'pages read into cache'),
CacheStat('cache_read_lookaside', 'pages read into cache requiring lookaside entries'),
CacheStat('cache_read_overflow', 'overflow pages read into cache'),
diff --git a/examples/c/Makefile.am b/examples/c/Makefile.am
index 72fd98aff7b..d5305eec5c8 100644
--- a/examples/c/Makefile.am
+++ b/examples/c/Makefile.am
@@ -7,7 +7,6 @@ noinst_PROGRAMS = \
ex_async \
ex_backup \
ex_call_center \
- ex_config \
ex_config_parse \
ex_cursor \
ex_data_source \
@@ -15,6 +14,7 @@ noinst_PROGRAMS = \
ex_event_handler \
ex_extending \
ex_extractor \
+ ex_file_system \
ex_hello \
ex_log \
ex_pack \
@@ -26,6 +26,7 @@ noinst_PROGRAMS = \
ex_thread
ex_encrypt_LDFLAGS = -rdynamic
+ex_file_system_LDFLAGS = -rdynamic
# The examples can be run with no arguments as simple smoke tests
TESTS = $(noinst_PROGRAMS)
diff --git a/examples/c/ex_all.c b/examples/c/ex_all.c
index 1c036b75461..ea97668c697 100644
--- a/examples/c/ex_all.c
+++ b/examples/c/ex_all.c
@@ -1037,6 +1037,13 @@ backup(WT_SESSION *session)
ret = cursor->close(cursor);
/*! [backup]*/
+ /*! [incremental backup]*/
+ /* Open the backup data source for incremental backup. */
+ ret = session->open_cursor(
+ session, "backup:", NULL, "target=(\"log:\")", &cursor);
+ /*! [incremental backup]*/
+ ret = cursor->close(cursor);
+
/*! [backup of a checkpoint]*/
ret = session->checkpoint(session, "drop=(from=June01),name=June01");
/*! [backup of a checkpoint]*/
diff --git a/examples/c/ex_config.c b/examples/c/ex_config.c
deleted file mode 100644
index 2ac8198176c..00000000000
--- a/examples/c/ex_config.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*-
- * Public Domain 2014-2016 MongoDB, Inc.
- * Public Domain 2008-2014 WiredTiger, Inc.
- *
- * This is free and unencumbered software released into the public domain.
- *
- * Anyone is free to copy, modify, publish, use, compile, sell, or
- * distribute this software, either in source code form or as a compiled
- * binary, for any purpose, commercial or non-commercial, and by any
- * means.
- *
- * In jurisdictions that recognize copyright laws, the author or authors
- * of this software dedicate any and all copyright interest in the
- * software to the public domain. We make this dedication for the benefit
- * of the public at large and to the detriment of our heirs and
- * successors. We intend this dedication to be an overt act of
- * relinquishment in perpetuity of all present and future rights to this
- * software under copyright law.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ex_config.c
- * This is an example demonstrating how to configure various database and
- * table properties.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <wiredtiger.h>
-
-static const char *home;
-
-int
-main(void)
-{
- int ret;
- WT_CONNECTION *conn;
- WT_SESSION *session;
- WT_CURSOR *cursor;
- const char *key, *value;
-
- /*
- * Create a clean test directory for this run of the test program if the
- * environment variable isn't already set (as is done by make check).
- */
- if (getenv("WIREDTIGER_HOME") == NULL) {
- home = "WT_HOME";
- ret = system("rm -rf WT_HOME && mkdir WT_HOME");
- } else
- home = NULL;
-
- /*! [configure cache size] */
- if ((ret = wiredtiger_open(home, NULL,
- "create,cache_size=500M", &conn)) != 0)
- fprintf(stderr, "Error connecting to %s: %s\n",
- home, wiredtiger_strerror(ret));
- /*! [configure cache size] */
-
- /*! [create a table] */
- ret = conn->open_session(conn, NULL, NULL, &session);
-
- ret = session->create(session,
- "table:access", "key_format=S,value_format=S");
- /*! [create a table] */
-
- /*! [transaction] */
- ret = session->begin_transaction(session, "priority=100,name=mytxn");
-
- ret = session->open_cursor(session, "config:", NULL, NULL, &cursor);
-
- while ((ret = cursor->next(cursor)) == 0) {
- ret = cursor->get_key(cursor, &key);
- ret = cursor->get_value(cursor, &value);
- printf("configuration value: %s = %s\n", key, value);
- }
-
- ret = session->commit_transaction(session, NULL);
- /*! [transaction] */
-
- ret = conn->close(conn, NULL);
-
- return (ret);
-}
diff --git a/examples/c/ex_event_handler.c b/examples/c/ex_event_handler.c
index d1e08edb04d..ea30a5990fb 100644
--- a/examples/c/ex_event_handler.c
+++ b/examples/c/ex_event_handler.c
@@ -111,10 +111,10 @@ config_event_handler(void)
/*! [Configure event_handler] */
/* Make an invalid API call, to ensure the event handler works. */
+ printf("ex_event_handler: expect an error message to follow\n");
(void)conn->open_session(conn, NULL, "isolation=invalid", &session);
- if (ret == 0)
- ret = conn->close(conn, NULL);
+ ret = conn->close(conn, NULL);
return (ret);
}
diff --git a/examples/c/ex_file_system.c b/examples/c/ex_file_system.c
new file mode 100644
index 00000000000..18ea9b7242e
--- /dev/null
+++ b/examples/c/ex_file_system.c
@@ -0,0 +1,721 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ex_file_system.c
+ * demonstrates how to use the custom file system interface
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include <wiredtiger.h>
+#include "queue_example.h"
+
+static const char *home;
+
+/*
+ * Example file system implementation. Using memory buffers to represent files.
+ *
+ * WARNING: This implementation isn't thread safe: WiredTiger performs schema
+ * and I/O operations in parallel, so all access to the handle must be thread-
+ * safe.
+ */
+typedef struct {
+ /*
+ * WiredTiger's file system interface. It is the first field because
+ * the code casts between WT_FILE_SYSTEM * and DEMO_FILE_SYSTEM *.
+ */
+ WT_FILE_SYSTEM iface;
+
+ /*
+ * Statistics printed by demo_fs_terminate: opened_file_count is
+ * incremented on every demo_fs_open call, opened_unique_file_count
+ * when demo_fs_open creates a new handle, and closed_file_count when
+ * demo_file_close drops a handle's reference count to zero.
+ */
+ int opened_file_count;
+ int opened_unique_file_count;
+ int closed_file_count;
+
+ /* Queue of file handles */
+ TAILQ_HEAD(demo_file_handle_qh, demo_file_handle) fileq;
+
+} DEMO_FILE_SYSTEM;
+
+typedef struct demo_file_handle {
+ /*
+ * WiredTiger's file handle interface. It is the first field because
+ * the code casts between WT_FILE_HANDLE * and DEMO_FILE_HANDLE *.
+ */
+ WT_FILE_HANDLE iface;
+
+ /*
+ * Add custom file handle fields after the interface.
+ */
+ /* Owning file system, used to reach the handle queue and counters. */
+ DEMO_FILE_SYSTEM *demo_fs;
+
+ /* Linkage in the file system's handle queue. */
+ TAILQ_ENTRY(demo_file_handle) q;
+ /*
+ * The handle stays on the queue after close: a reference count of
+ * zero means "exists but not open", and demo_fs_open refuses to open
+ * a handle whose count is non-zero.
+ */
+ uint32_t ref; /* Reference count */
+
+ /*
+ * size is both the allocated length of buf and the file size
+ * reported by demo_file_size.
+ */
+ char *buf; /* In-memory contents */
+ size_t size;
+ size_t off; /* Read/write offset */
+} DEMO_FILE_HANDLE;
+
+/*
+ * Extension initialization function.
+ */
+#ifdef _WIN32
+/*
+ * Explicitly export this function so it is visible when loading extensions.
+ */
+__declspec(dllexport)
+#endif
+int demo_file_system_create(WT_CONNECTION *, WT_CONFIG_ARG *);
+
+/*
+ * Forward function declarations for file system API implementation
+ */
+static int demo_fs_open(WT_FILE_SYSTEM *,
+ WT_SESSION *, const char *, WT_OPEN_FILE_TYPE, uint32_t, WT_FILE_HANDLE **);
+static int demo_fs_directory_list(WT_FILE_SYSTEM *, WT_SESSION *,
+ const char *, const char *, char ***, uint32_t *);
+static int demo_fs_directory_list_free(
+ WT_FILE_SYSTEM *, WT_SESSION *, char **, uint32_t);
+static int demo_fs_directory_sync(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *directory);
+static int demo_fs_exist(WT_FILE_SYSTEM *, WT_SESSION *, const char *, bool *);
+static int demo_fs_remove(WT_FILE_SYSTEM *, WT_SESSION *, const char *);
+static int demo_fs_rename(
+ WT_FILE_SYSTEM *, WT_SESSION *, const char *, const char *);
+static int demo_fs_size(
+ WT_FILE_SYSTEM *, WT_SESSION *, const char *, wt_off_t *);
+static int demo_fs_terminate(WT_FILE_SYSTEM *, WT_SESSION *);
+
+/*
+ * Forward function declarations for file handle API implementation
+ */
+static int demo_file_close(WT_FILE_HANDLE *, WT_SESSION *);
+static int demo_file_lock(WT_FILE_HANDLE *, WT_SESSION *, bool);
+static int demo_file_read(
+ WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, void *);
+static int demo_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *);
+static int demo_file_sync(WT_FILE_HANDLE *, WT_SESSION *);
+static int demo_file_sync_nowait(WT_FILE_HANDLE *, WT_SESSION *);
+static int demo_file_truncate(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t);
+static int demo_file_write(
+ WT_FILE_HANDLE *, WT_SESSION *, wt_off_t, size_t, const void *);
+
+/*
+ * Forward function declarations for internal functions
+ */
+static int demo_handle_remove(WT_SESSION *, DEMO_FILE_HANDLE *);
+static DEMO_FILE_HANDLE *demo_handle_search(WT_FILE_SYSTEM *, const char *);
+
+/*
+ * Size in bytes of the buffer allocated for a newly created file, and the
+ * amount of slack demo_file_write adds when it resizes a file's buffer.
+ */
+#define DEMO_FILE_SIZE_INCREMENT 32768
+
+/*
+ * demo_file_system_create --
+ *    Extension entry point for the demo file system: allocate the file
+ *    system, fill in its method table and register it with the connection.
+ *
+ *    Returns 0 on success and ENOMEM if the file system can't be allocated.
+ *    A failure to register the file system is treated as fatal: the
+ *    process exits.
+ */
+int
+demo_file_system_create(WT_CONNECTION *conn, WT_CONFIG_ARG *config)
+{
+    DEMO_FILE_SYSTEM *demo_fs;
+    WT_FILE_SYSTEM *file_system;
+    int ret;
+
+    (void)config;                /* Unused */
+
+    if ((demo_fs = calloc(1, sizeof(*demo_fs))) == NULL)
+        return (ENOMEM);
+    file_system = (WT_FILE_SYSTEM *)demo_fs;
+
+    /*
+     * Initialize the handle queue explicitly. Zeroed memory leaves the
+     * queue's tail pointer NULL, which only happens to work while every
+     * insertion goes through TAILQ_INSERT_HEAD.
+     */
+    TAILQ_INIT(&demo_fs->fileq);
+
+    /* Initialize the in-memory jump table. */
+    file_system->directory_list = demo_fs_directory_list;
+    file_system->directory_list_free = demo_fs_directory_list_free;
+    file_system->directory_sync = demo_fs_directory_sync;
+    file_system->exist = demo_fs_exist;
+    file_system->open_file = demo_fs_open;
+    file_system->remove = demo_fs_remove;
+    file_system->rename = demo_fs_rename;
+    file_system->size = demo_fs_size;
+    file_system->terminate = demo_fs_terminate;
+
+    if ((ret = conn->set_file_system(conn, file_system, NULL)) != 0) {
+        fprintf(stderr, "Error setting custom file system: %s\n",
+            wiredtiger_strerror(ret));
+        goto err;
+    }
+
+    return (0);
+
+err:    free(demo_fs);
+    /* An error installing the file system is fatal. */
+    exit(1);
+}
+
+/*
+ * demo_fs_open --
+ *    Open a file by name, creating it if it doesn't exist.
+ *
+ *    An existing, closed file is re-opened with its contents intact; an
+ *    attempt to open a file that is already open fails with EBUSY because
+ *    only a single open handle per file is supported. Returns ENOMEM if a
+ *    new file can't be allocated.
+ */
+static int
+demo_fs_open(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
+    const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+    WT_FILE_HANDLE **file_handlep)
+{
+    DEMO_FILE_HANDLE *fh;
+    DEMO_FILE_SYSTEM *fs;
+    WT_FILE_HANDLE *handle;
+
+    (void)session;               /* Unused */
+    (void)file_type;             /* Unused */
+    (void)flags;                 /* Unused */
+
+    fs = (DEMO_FILE_SYSTEM *)file_system;
+    fs->opened_file_count++;
+
+    /* Re-open the file if it's already in the queue. */
+    fh = demo_handle_search(file_system, name);
+    if (fh != NULL) {
+        /* Only a single open handle per file is supported. */
+        if (fh->ref != 0) {
+            fprintf(stderr,
+                "demo_file_open of already open file %s\n",
+                name);
+            return (EBUSY);
+        }
+
+        fh->off = 0;
+        fh->ref = 1;
+        *file_handlep = (WT_FILE_HANDLE *)fh;
+        return (0);
+    }
+
+    /* First open of this name: build a new handle. */
+    fh = calloc(1, sizeof(DEMO_FILE_HANDLE));
+    if (fh == NULL)
+        return (ENOMEM);
+    handle = (WT_FILE_HANDLE *)fh;
+
+    /* Private state: one reference and an initial zero-filled buffer. */
+    fh->demo_fs = fs;
+    fh->ref = 1;
+    fh->off = 0;
+    fh->buf = calloc(1, DEMO_FILE_SIZE_INCREMENT);
+    if (fh->buf == NULL) {
+        free(fh);
+        return (ENOMEM);
+    }
+    fh->size = DEMO_FILE_SIZE_INCREMENT;
+
+    /* Public state: the handle owns a private copy of the name. */
+    handle->name = strdup(name);
+    if (handle->name == NULL) {
+        free(fh->buf);
+        free(fh);
+        return (ENOMEM);
+    }
+
+    /* Methods this implementation provides. */
+    handle->close = demo_file_close;
+    handle->lock = demo_file_lock;
+    handle->read = demo_file_read;
+    handle->size = demo_file_size;
+    handle->sync = demo_file_sync;
+    handle->sync_nowait = demo_file_sync_nowait;
+    handle->truncate = demo_file_truncate;
+    handle->write = demo_file_write;
+
+    /* Methods left NULL because the functionality isn't supported. */
+    handle->fadvise = NULL;
+    handle->fallocate = NULL;
+    handle->fallocate_nolock = NULL;
+    handle->map = NULL;
+    handle->map_discard = NULL;
+    handle->map_preload = NULL;
+    handle->unmap = NULL;
+
+    TAILQ_INSERT_HEAD(&fs->fileq, fh, q);
+    fs->opened_unique_file_count++;
+
+    *file_handlep = handle;
+    return (0);
+}
+
+/*
+ * demo_fs_directory_list --
+ *    Return a list of files in a given sub-directory.
+ *
+ *    A file is listed when its name starts with the directory string and,
+ *    if a prefix is given, with the prefix string. On success *dirlistp
+ *    is an allocated array of allocated name copies (NULL if nothing
+ *    matched) and *countp is the number of names; release them with
+ *    demo_fs_directory_list_free. On allocation failure ENOMEM is
+ *    returned, nothing is leaked and the outputs are set to NULL/0.
+ *
+ *    NOTE(review): the prefix is compared against the start of the full
+ *    name, not against the part following the directory -- confirm that
+ *    this is the matching WiredTiger expects.
+ */
+static int
+demo_fs_directory_list(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *directory,
+    const char *prefix, char ***dirlistp, uint32_t *countp)
+{
+    DEMO_FILE_HANDLE *demo_fh;
+    DEMO_FILE_SYSTEM *demo_fs;
+    size_t dir_len, prefix_len;
+    char *name, **entries, **grown;
+    uint32_t allocated, count;
+
+    (void)session;               /* Unused */
+
+    demo_fs = (DEMO_FILE_SYSTEM *)file_system;
+    entries = NULL;
+    allocated = count = 0;
+    dir_len = strlen(directory);
+    prefix_len = prefix == NULL ? 0 : strlen(prefix);
+
+    TAILQ_FOREACH(demo_fh, &demo_fs->fileq, q) {
+        name = demo_fh->iface.name;
+        if (strncmp(name, directory, dir_len) != 0 ||
+            (prefix != NULL && strncmp(name, prefix, prefix_len) != 0))
+            continue;
+
+        /*
+         * Increase the list size in groups of 10, it doesn't
+         * matter if the list is a bit longer than necessary.
+         */
+        if (count >= allocated) {
+            /*
+             * Keep the old array until the resize succeeds so it
+             * can still be released on failure.
+             */
+            grown = realloc(
+                entries, (allocated + 10) * sizeof(char *));
+            if (grown == NULL)
+                goto enomem;
+            entries = grown;
+
+            /*
+             * Clear the new slots. Pointer arithmetic is already
+             * in units of elements, so the offset must not be
+             * scaled by the element size.
+             */
+            memset(entries + allocated, 0, 10 * sizeof(char *));
+            allocated += 10;
+        }
+        if ((entries[count] = strdup(name)) == NULL)
+            goto enomem;
+        ++count;
+    }
+
+    *dirlistp = entries;
+    *countp = count;
+
+    return (0);
+
+enomem: /* Discard the partial list, the caller gets nothing to free. */
+    while (count > 0)
+        free(entries[--count]);
+    free(entries);
+    *dirlistp = NULL;
+    *countp = 0;
+    return (ENOMEM);
+}
+
+/*
+ * demo_fs_directory_list_free --
+ *    Release a name list returned by demo_fs_directory_list: each of the
+ *    count entries, then the array itself. A NULL list is a no-op.
+ *    Always returns 0.
+ */
+static int
+demo_fs_directory_list_free(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, char **dirlist, uint32_t count)
+{
+    uint32_t remaining;
+
+    (void)file_system;
+    (void)session;
+
+    if (dirlist == NULL)
+        return (0);
+
+    /* Free the entries from the last to the first, then the array. */
+    for (remaining = count; remaining > 0; --remaining)
+        free(dirlist[remaining - 1]);
+    free(dirlist);
+
+    return (0);
+}
+
+/*
+ * demo_fs_directory_sync --
+ *    Flush a directory. The demo file system does nothing here, so this
+ *    always succeeds.
+ */
+static int
+demo_fs_directory_sync(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *directory)
+{
+    /* None of the arguments are needed. */
+    (void)directory;
+    (void)session;
+    (void)file_system;
+
+    return (0);
+}
+
+/*
+ * demo_fs_exist --
+ *    Set *existp to whether a file with the given name is in the file
+ *    system's handle queue. Always returns 0.
+ */
+static int
+demo_fs_exist(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *name, bool *existp)
+{
+    DEMO_FILE_HANDLE *found;
+
+    (void)session;               /* Unused */
+
+    found = demo_handle_search(file_system, name);
+    *existp = found != NULL;
+
+    return (0);
+}
+
+/*
+ * demo_fs_remove --
+ *    Remove a file by name. Returns ENOENT if no such file exists,
+ *    otherwise the result of destroying the file's handle.
+ */
+static int
+demo_fs_remove(
+    WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name)
+{
+    DEMO_FILE_HANDLE *victim;
+
+    victim = demo_handle_search(file_system, name);
+    if (victim == NULL)
+        return (ENOENT);
+
+    return (demo_handle_remove(session, victim));
+}
+
+/*
+ * demo_fs_rename --
+ *    Rename a file by replacing the name stored in its handle. Returns
+ *    ENOENT if the source file doesn't exist and ENOMEM if the new name
+ *    can't be copied, in which case the old name is left untouched.
+ */
+static int
+demo_fs_rename(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *from, const char *to)
+{
+    DEMO_FILE_HANDLE *fh;
+    char *new_name;
+
+    (void)session;               /* Unused */
+
+    fh = demo_handle_search(file_system, from);
+    if (fh == NULL)
+        return (ENOENT);
+
+    /* Copy the new name first so a failure leaves the handle intact. */
+    new_name = strdup(to);
+    if (new_name == NULL)
+        return (ENOMEM);
+
+    free(fh->iface.name);
+    fh->iface.name = new_name;
+
+    return (0);
+}
+
+/*
+ * demo_fs_size --
+ *    Look a file up by name and report its size in bytes through the
+ *    by-handle size function. Returns ENOENT if there is no such file.
+ */
+static int
+demo_fs_size(WT_FILE_SYSTEM *file_system,
+    WT_SESSION *session, const char *name, wt_off_t *sizep)
+{
+    DEMO_FILE_HANDLE *fh;
+
+    fh = demo_handle_search(file_system, name);
+    if (fh == NULL)
+        return (ENOENT);
+
+    return (demo_file_size((WT_FILE_HANDLE *)fh, session, sizep));
+}
+
+/*
+ * demo_fs_terminate --
+ *    Discard any resources on termination: destroy every file handle,
+ *    print the open/close statistics and free the file system.
+ *
+ *    Returns 0 on success, otherwise the first error encountered. A file
+ *    that is still open is reported, counted as an error (EBUSY) and
+ *    discarded anyway.
+ */
+static int
+demo_fs_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *session)
+{
+    DEMO_FILE_HANDLE *demo_fh;
+    DEMO_FILE_SYSTEM *demo_fs;
+    int ret = 0, tret;
+
+    demo_fs = (DEMO_FILE_SYSTEM *)file_system;
+
+    while ((demo_fh = TAILQ_FIRST(&demo_fs->fileq)) != NULL) {
+        /*
+         * demo_handle_remove refuses to destroy a handle that is
+         * still referenced and leaves it on the queue, which would
+         * make this loop spin forever on the same handle. Drop the
+         * outstanding references first.
+         */
+        if (demo_fh->ref != 0) {
+            fprintf(stderr,
+                "demo_fs_terminate: discarding file %s with "
+                "non-zero reference count of %u\n",
+                demo_fh->iface.name, demo_fh->ref);
+            demo_fh->ref = 0;
+            if (ret == 0)
+                ret = EBUSY;
+        }
+
+        if ((tret = demo_handle_remove(session, demo_fh)) != 0) {
+            if (ret == 0)
+                ret = tret;
+            /*
+             * The handle wasn't removed, so retrying can't make
+             * progress: give up on the rest of the queue.
+             */
+            break;
+        }
+    }
+
+    printf("Custom file system\n");
+    printf("\t%d unique file opens\n", demo_fs->opened_unique_file_count);
+    printf("\t%d opened\n", demo_fs->opened_file_count);
+    printf("\t%d closed\n", demo_fs->closed_file_count);
+    free(demo_fs);
+
+    return (ret);
+}
+
+/*
+ * demo_file_close --
+ *    Close a file handle by dropping one reference. The handle and its
+ *    contents stay in the file system so the file can be re-opened.
+ *    Returns EINVAL if the handle isn't open.
+ */
+static int
+demo_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
+{
+    DEMO_FILE_HANDLE *fh;
+
+    (void)session;               /* Unused */
+
+    fh = (DEMO_FILE_HANDLE *)file_handle;
+
+    /* The count is unsigned: no references means already closed. */
+    if (fh->ref == 0) {
+        fprintf(stderr, "Closing already closed handle: %s\n",
+            fh->iface.name);
+        return (EINVAL);
+    }
+
+    /* Count the close once the last reference is gone. */
+    if (--fh->ref == 0)
+        fh->demo_fs->closed_file_count++;
+
+    return (0);
+}
+
+/*
+ * demo_file_lock --
+ *    Lock or unlock a file. No locking is actually done: every request
+ *    is granted, so this always returns 0.
+ */
+static int
+demo_file_lock(WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock)
+{
+    /* None of the arguments are needed. */
+    (void)lock;
+    (void)session;
+    (void)file_handle;
+
+    return (0);
+}
+
+/*
+ * demo_file_read --
+ *    Copy up to len bytes starting at the given offset of the file into
+ *    buf, and record the position following the read in the handle.
+ *
+ *    A read starting at or beyond the end of the file fails with EINVAL.
+ *    A read that starts inside the file but extends past its end is cut
+ *    short at the end of the file and still returns 0.
+ */
+static int
+demo_file_read(WT_FILE_HANDLE *file_handle,
+    WT_SESSION *session, wt_off_t offset, size_t len, void *buf)
+{
+    DEMO_FILE_HANDLE *fh;
+    size_t available, start;
+
+    (void)session;               /* Unused */
+
+    fh = (DEMO_FILE_HANDLE *)file_handle;
+    start = (size_t)offset;
+
+    /*
+     * WiredTiger should never request data past the end of a file, so
+     * flag an error if it does.
+     */
+    if (start >= fh->size) {
+        fprintf(stderr,
+            "%s: handle-read: failed to read %zu bytes at offset %zu\n",
+            fh->iface.name, len, start);
+        return (EINVAL);
+    }
+
+    /* Clamp the request to the bytes available from the offset. */
+    available = fh->size - start;
+    if (len > available)
+        len = available;
+
+    memcpy(buf, (uint8_t *)fh->buf + start, len);
+    fh->off = start + len;
+
+    return (0);
+}
+
+/*
+ * demo_file_size --
+ *    Report the size of an open file in bytes through *sizep. Always
+ *    returns 0; a zero-sized file trips an assertion.
+ */
+static int
+demo_file_size(
+    WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep)
+{
+    DEMO_FILE_HANDLE *fh;
+    size_t bytes;
+
+    (void)session;               /* Unused */
+
+    fh = (DEMO_FILE_HANDLE *)file_handle;
+    bytes = fh->size;
+
+    assert(bytes != 0);
+    *sizep = (wt_off_t)bytes;
+
+    return (0);
+}
+
+/*
+ * demo_file_sync --
+ *    Make the file's contents stable. Files only exist in memory, so
+ *    there is nothing to flush and this always returns 0.
+ */
+static int
+demo_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
+{
+    /* Neither argument is needed. */
+    (void)session;
+    (void)file_handle;
+
+    return (0);
+}
+
+/*
+ * demo_file_sync_nowait --
+ *    Non-waiting variant of the file sync. Files only exist in memory,
+ *    so there is nothing to flush and this always returns 0.
+ */
+static int
+demo_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *session)
+{
+    /* Neither argument is needed. */
+    (void)session;
+    (void)file_handle;
+
+    return (0);
+}
+
+/*
+ * demo_file_truncate --
+ *    Set the size of a file to the given offset, growing or shrinking
+ *    its buffer. Bytes added by growing the file read as zero.
+ *
+ *    Returns EINVAL for a negative offset and ENOSPC if the buffer can't
+ *    be resized; in both cases the file is left unchanged.
+ */
+static int
+demo_file_truncate(
+    WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t offset)
+{
+    DEMO_FILE_HANDLE *demo_fh;
+    size_t off;
+    void *resized;
+
+    (void)session;               /* Unused */
+    demo_fh = (DEMO_FILE_HANDLE *)file_handle;
+
+    /* A negative offset would turn into an enormous size when cast. */
+    if (offset < 0)
+        return (EINVAL);
+    off = (size_t)offset;
+
+    /*
+     * Resize through a temporary so a failure doesn't lose (and leak)
+     * the existing contents. Never ask for zero bytes: realloc is then
+     * allowed to free the buffer and return NULL, which would look like
+     * a failure while leaving a dangling pointer in the handle.
+     */
+    resized = realloc(demo_fh->buf, off == 0 ? 1 : off);
+    if (resized == NULL) {
+        fprintf(stderr, "Failed to resize buffer in truncate\n");
+        return (ENOSPC);
+    }
+    demo_fh->buf = resized;
+
+    /* Clear any new space in the file and reset the file's length. */
+    if (demo_fh->size < off)
+        memset((uint8_t *)demo_fh->buf + demo_fh->size,
+            0, off - demo_fh->size);
+    demo_fh->size = off;
+
+    return (0);
+}
+
+/*
+ * demo_file_write --
+ *    Copy len bytes from buf into the file at the given offset, growing
+ *    the file if the write extends past its current end, and record the
+ *    position following the write in the handle.
+ *
+ *    Returns EINVAL for a negative offset, or the error from resizing
+ *    the file; nothing is written in either case.
+ */
+static int
+demo_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+    wt_off_t offset, size_t len, const void *buf)
+{
+    DEMO_FILE_HANDLE *demo_fh;
+    size_t end, off;
+    int ret;
+
+    demo_fh = (DEMO_FILE_HANDLE *)file_handle;
+
+    if (offset < 0)
+        return (EINVAL);
+    off = (size_t)offset;
+    end = off + len;
+
+    /*
+     * Make sure the buffer is large enough for the write. Only ever
+     * grow it: resizing unconditionally would shrink the file, and
+     * discard the data beyond the write, whenever a write lands well
+     * before the current end of the file. Growing by more than is
+     * needed avoids a resize on every append.
+     */
+    if (end > demo_fh->size &&
+        (ret = demo_file_truncate(file_handle, session,
+        (wt_off_t)(end + DEMO_FILE_SIZE_INCREMENT))) != 0)
+        return (ret);
+
+    memcpy((uint8_t *)demo_fh->buf + off, buf, len);
+    demo_fh->off = end;
+
+    return (0);
+}
+
+/*
+ * demo_handle_remove --
+ *    Unlink a file handle from its file system and free it along with
+ *    its contents and name. Used when a file is removed and at shutdown.
+ *    Returns EINVAL, leaving the handle in place, if it is still open.
+ */
+static int
+demo_handle_remove(WT_SESSION *session, DEMO_FILE_HANDLE *demo_fh)
+{
+    DEMO_FILE_SYSTEM *owner;
+
+    (void)session;               /* Unused */
+
+    /* An open file can't be destroyed. */
+    if (demo_fh->ref != 0) {
+        fprintf(stderr,
+            "demo_handle_remove on file %s with non-zero reference "
+            "count of %u\n",
+            demo_fh->iface.name, demo_fh->ref);
+        return (EINVAL);
+    }
+
+    owner = demo_fh->demo_fs;
+    TAILQ_REMOVE(&owner->fileq, demo_fh, q);
+
+    /* Release the name, the contents and finally the handle itself. */
+    free(demo_fh->iface.name);
+    free(demo_fh->buf);
+    free(demo_fh);
+
+    return (0);
+}
+
+/*
+ * demo_handle_search --
+ *    Find the handle of the file with exactly the given name. Returns
+ *    NULL if the file system has no such file.
+ */
+static DEMO_FILE_HANDLE *
+demo_handle_search(WT_FILE_SYSTEM *file_system, const char *name)
+{
+    DEMO_FILE_HANDLE *candidate;
+    DEMO_FILE_SYSTEM *fs;
+
+    fs = (DEMO_FILE_SYSTEM *)file_system;
+
+    /* Walk the queue from the head and stop at the first match. */
+    for (candidate = TAILQ_FIRST(&fs->fileq);
+        candidate != NULL; candidate = TAILQ_NEXT(candidate, q))
+        if (strcmp(candidate->iface.name, name) == 0)
+            return (candidate);
+
+    return (NULL);
+}
+
+/*
+ * main --
+ *    Open a WiredTiger connection that loads the demo file system as an
+ *    early-load local extension, then close it again. Returns 0 on
+ *    success, otherwise the WiredTiger error.
+ */
+int
+main(void)
+{
+    WT_CONNECTION *conn;
+    const char *home_name, *open_config;
+    int ret;
+
+    /*
+     * Create a clean test directory for this run of the test program if the
+     * environment variable isn't already set (as is done by make check).
+     *
+     * Keep a printable name for the home directory: home itself is NULL
+     * when the environment variable is set, and passing NULL for a "%s"
+     * conversion is undefined behavior.
+     */
+    if ((home_name = getenv("WIREDTIGER_HOME")) == NULL) {
+        home = home_name = "WT_HOME";
+        /* Best effort: a real problem shows up when opening below. */
+        if (system("rm -rf WT_HOME && mkdir WT_HOME") != 0)
+            fprintf(stderr,
+                "Warning: failed to create a clean %s directory\n",
+                home_name);
+    } else
+        home = NULL;
+
+    /*! [WT_FILE_SYSTEM register] */
+    /*
+     * Setup a configuration string that will load our custom file system.
+     * Use the special local extension to indicate that the entry point is
+     * in the same executable. Also enable early load for this extension,
+     * since WiredTiger needs to be able to find it before doing any file
+     * operations.
+     */
+    open_config = "create,log=(enabled=true),extensions=(local="
+        "{entry=demo_file_system_create,early_load=true})";
+    /* Open a connection to the database, creating it if necessary. */
+    if ((ret = wiredtiger_open(home, NULL, open_config, &conn)) != 0) {
+        fprintf(stderr, "Error connecting to %s: %s\n",
+            home_name, wiredtiger_strerror(ret));
+        return (ret);
+    }
+    /*! [WT_FILE_SYSTEM register] */
+
+    if ((ret = conn->close(conn, NULL)) != 0)
+        fprintf(stderr, "Error closing connection to %s: %s\n",
+            home_name, wiredtiger_strerror(ret));
+
+    return (ret);
+}
diff --git a/examples/c/ex_schema.c b/examples/c/ex_schema.c
index 70fc7eb2e62..155b982bbbe 100644
--- a/examples/c/ex_schema.c
+++ b/examples/c/ex_schema.c
@@ -69,7 +69,8 @@ main(void)
{
POP_RECORD *p;
WT_CONNECTION *conn;
- WT_CURSOR *cursor, *cursor2, *join_cursor, *stat_cursor;
+ WT_CURSOR *country_cursor, *country_cursor2, *cursor, *join_cursor,
+ *stat_cursor, *subjoin_cursor, *year_cursor;
WT_SESSION *session;
const char *country;
uint64_t recno, population;
@@ -336,18 +337,18 @@ main(void)
ret = session->open_cursor(session,
"join:table:poptable", NULL, NULL, &join_cursor);
ret = session->open_cursor(session,
- "index:poptable:country", NULL, NULL, &cursor);
+ "index:poptable:country", NULL, NULL, &country_cursor);
ret = session->open_cursor(session,
- "index:poptable:immutable_year", NULL, NULL, &cursor2);
+ "index:poptable:immutable_year", NULL, NULL, &year_cursor);
/* select values WHERE country == "AU" AND year > 1900 */
- cursor->set_key(cursor, "AU\0\0\0");
- ret = cursor->search(cursor);
- ret = session->join(session, join_cursor, cursor,
+ country_cursor->set_key(country_cursor, "AU\0\0\0");
+ ret = country_cursor->search(country_cursor);
+ ret = session->join(session, join_cursor, country_cursor,
"compare=eq,count=10");
- cursor2->set_key(cursor2, (uint16_t)1900);
- ret = cursor2->search(cursor2);
- ret = session->join(session, join_cursor, cursor2,
+ year_cursor->set_key(year_cursor, (uint16_t)1900);
+ ret = year_cursor->search(year_cursor);
+ ret = session->join(session, join_cursor, year_cursor,
"compare=gt,count=10,strategy=bloom");
/* List the values that are joined */
@@ -370,8 +371,61 @@ main(void)
ret = stat_cursor->close(stat_cursor);
ret = join_cursor->close(join_cursor);
- ret = cursor2->close(cursor2);
- ret = cursor->close(cursor);
+ ret = year_cursor->close(year_cursor);
+ ret = country_cursor->close(country_cursor);
+
+ /*! [Complex join cursors] */
+ /* Open cursors needed by the join. */
+ ret = session->open_cursor(session,
+ "join:table:poptable", NULL, NULL, &join_cursor);
+ ret = session->open_cursor(session,
+ "join:table:poptable", NULL, NULL, &subjoin_cursor);
+ ret = session->open_cursor(session,
+ "index:poptable:country", NULL, NULL, &country_cursor);
+ ret = session->open_cursor(session,
+ "index:poptable:country", NULL, NULL, &country_cursor2);
+ ret = session->open_cursor(session,
+ "index:poptable:immutable_year", NULL, NULL, &year_cursor);
+
+ /*
+ * select values WHERE (country == "AU" OR country == "UK")
+ * AND year > 1900
+ *
+ * First, set up the join representing the country clause.
+ */
+ country_cursor->set_key(country_cursor, "AU\0\0\0");
+ ret = country_cursor->search(country_cursor);
+ ret = session->join(session, subjoin_cursor, country_cursor,
+ "operation=or,compare=eq,count=10");
+ country_cursor2->set_key(country_cursor2, "UK\0\0\0");
+ ret = country_cursor2->search(country_cursor2);
+ ret = session->join(session, subjoin_cursor, country_cursor2,
+ "operation=or,compare=eq,count=10");
+
+ /* Join that to the top join, and add the year clause */
+ ret = session->join(session, join_cursor, subjoin_cursor, NULL);
+ year_cursor->set_key(year_cursor, (uint16_t)1900);
+ ret = year_cursor->search(year_cursor);
+ ret = session->join(session, join_cursor, year_cursor,
+ "compare=gt,count=10,strategy=bloom");
+
+ /* List the values that are joined */
+ while ((ret = join_cursor->next(join_cursor)) == 0) {
+ ret = join_cursor->get_key(join_cursor, &recno);
+ ret = join_cursor->get_value(join_cursor, &country, &year,
+ &population);
+ printf("ID %" PRIu64, recno);
+ printf(
+ ": country %s, year %" PRIu16 ", population %" PRIu64 "\n",
+ country, year, population);
+ }
+ /*! [Complex join cursors] */
+
+ ret = join_cursor->close(join_cursor);
+ ret = subjoin_cursor->close(subjoin_cursor);
+ ret = country_cursor->close(country_cursor);
+ ret = country_cursor2->close(country_cursor2);
+ ret = year_cursor->close(year_cursor);
ret = conn->close(conn, NULL);
diff --git a/examples/c/ex_scope.c b/examples/c/ex_scope.c
index 93878ec7e3d..ef4d67ad722 100644
--- a/examples/c/ex_scope.c
+++ b/examples/c/ex_scope.c
@@ -106,10 +106,12 @@ cursor_scope_ops(WT_CURSOR *cursor)
* memory, but as it does not position the cursor, it
* doesn't reference memory owned by the cursor, either.
*/
+ printf("ex_scope: "
+ "expect two WiredTiger error messages:\n");
if ((ret = cursor->get_key(cursor, &key)) == 0 ||
(ret = cursor->get_value(cursor, &value)) == 0) {
fprintf(stderr,
- "%s: error in s get_key/value: %s\n",
+ "%s: error in get_key/value: %s\n",
op->op, session->strerror(session, ret));
return (ret);
}
@@ -122,6 +124,8 @@ cursor_scope_ops(WT_CURSOR *cursor)
* reference key memory owned by the cursor, but has no
* value.
*/
+ printf("ex_scope: "
+ "expect one WiredTiger error message:\n");
if ((ret = cursor->get_key(cursor, &key)) != 0 ||
(ret = cursor->get_value(cursor, &value)) == 0) {
fprintf(stderr,
diff --git a/examples/c/queue_example.h b/examples/c/queue_example.h
new file mode 100644
index 00000000000..5f6674b5d1d
--- /dev/null
+++ b/examples/c/queue_example.h
@@ -0,0 +1,149 @@
+/*-
+ * Public Domain 2014-2016 MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This is a stripped down copy of the FreeBSD queue.h include file to make
+ * TAILQ_XXX functionality available in WiredTiger example programs.
+ */
+
+/*-
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h 8.5 (Berkeley) 8/20/94
+ * $FreeBSD: releng/10.2/sys/sys/queue.h 279633 2015-03-05 09:23:43Z hselasky $
+ */
+
+/*
+ * Tail queue declarations.
+ */
+/*
+ * TAILQ_HEAD --
+ * Declare "struct name", the head of a tail queue whose elements are
+ * of type "struct type".
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+}
+
+/*
+ * TAILQ_ENTRY --
+ * Declare the anonymous linkage structure to embed as a field in each
+ * element of type "struct type"; the name of that field is what the
+ * other macros take as their "field" argument.
+ */
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+}
+
+/*
+ * Tail queue functions.
+ *
+ * In the macros below, "head" is a pointer to a TAILQ_HEAD structure, "elm"
+ * and "listelm" are pointers to elements and "field" is the name of the
+ * TAILQ_ENTRY member inside the element structure. The macros evaluate
+ * their arguments more than once, so don't pass expressions with side
+ * effects.
+ */
+/* True if the queue has no elements. */
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+/* The first element of the queue, NULL if the queue is empty. */
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+/*
+ * Loop header visiting each element from first to last; "var" is NULL once
+ * the loop runs to completion. The step reads the current element's next
+ * pointer, so the element must not be freed inside the loop body.
+ */
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+/* Make the queue empty: no first element, tail pointing at the head. */
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+} while (0)
+
+/* Insert "elm" immediately after "listelm". */
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+} while (0)
+
+/* Insert "elm" immediately before "listelm". */
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+} while (0)
+
+/* Insert "elm" as the first element of the queue. */
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+} while (0)
+
+/*
+ * Insert "elm" as the last element of the queue. This stores through the
+ * head's tqh_last pointer, so the head must have been set up by TAILQ_INIT.
+ */
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+} while (0)
+
+/*
+ * The last element of the queue; "headname" is the structure tag given to
+ * TAILQ_HEAD. NOTE(review): the cast appears to rely on the head and entry
+ * structures sharing the same two-pointer layout -- confirm before changing
+ * either declaration.
+ */
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+/* The element following "elm", NULL if "elm" is the last element. */
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+/*
+ * The element preceding "elm"; uses the same cast as TAILQ_LAST.
+ */
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+/* Unlink "elm" from the queue; the element itself is not freed. */
+#define TAILQ_REMOVE(head, elm, field) do { \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+} while (0)
diff --git a/examples/java/com/wiredtiger/examples/ex_all.java b/examples/java/com/wiredtiger/examples/ex_all.java
index 5fe767d49bf..48e85c9fade 100644
--- a/examples/java/com/wiredtiger/examples/ex_all.java
+++ b/examples/java/com/wiredtiger/examples/ex_all.java
@@ -878,6 +878,18 @@ backup(Session session)
": backup failed: " + ex.toString());
}
/*! [backup]*/
+ try {
+ /*! [incremental backup]*/
+ /* Open the backup data source for incremental backup. */
+ cursor = session.open_cursor("backup:", null, "target=(\"log:\")");
+ /*! [incremental backup]*/
+
+ ret = cursor.close();
+ }
+ catch (Exception ex) {
+ System.err.println(progname +
+ ": incremental backup failed: " + ex.toString());
+ }
/*! [backup of a checkpoint]*/
ret = session.checkpoint("drop=(from=June01),name=June01");
diff --git a/examples/java/com/wiredtiger/examples/ex_schema.java b/examples/java/com/wiredtiger/examples/ex_schema.java
index 7cc26acb479..76bff66a688 100644
--- a/examples/java/com/wiredtiger/examples/ex_schema.java
+++ b/examples/java/com/wiredtiger/examples/ex_schema.java
@@ -76,7 +76,8 @@ public class ex_schema {
throws WiredTigerException
{
Connection conn;
- Cursor cursor, cursor2, join_cursor, stat_cursor;
+ Cursor country_cursor, country_cursor2, cursor, join_cursor,
+ stat_cursor, subjoin_cursor, year_cursor;
Session session;
String country;
long recno, population;
@@ -343,18 +344,18 @@ public class ex_schema {
/* Open cursors needed by the join. */
join_cursor = session.open_cursor(
"join:table:poptable", null, null);
- cursor = session.open_cursor(
+ country_cursor = session.open_cursor(
"index:poptable:country", null, null);
- cursor2 = session.open_cursor(
+ year_cursor = session.open_cursor(
"index:poptable:immutable_year", null, null);
/* select values WHERE country == "AU" AND year > 1900 */
- cursor.putKeyString("AU");
- ret = cursor.search();
- session.join(join_cursor, cursor, "compare=eq,count=10");
- cursor2.putKeyShort((short)1900);
- ret = cursor2.search();
- session.join(join_cursor, cursor2,
+ country_cursor.putKeyString("AU");
+ ret = country_cursor.search();
+ session.join(join_cursor, country_cursor, "compare=eq,count=10");
+ year_cursor.putKeyShort((short)1900);
+ ret = year_cursor.search();
+ session.join(join_cursor, year_cursor,
"compare=gt,count=10,strategy=bloom");
/* List the values that are joined */
@@ -376,8 +377,61 @@ public class ex_schema {
ret = stat_cursor.close();
ret = join_cursor.close();
- ret = cursor2.close();
- ret = cursor.close();
+ ret = year_cursor.close();
+ ret = country_cursor.close();
+
+ /*! [Complex join cursors] */
+ /* Open cursors needed by the join. */
+ join_cursor = session.open_cursor(
+ "join:table:poptable", null, null);
+ subjoin_cursor = session.open_cursor(
+ "join:table:poptable", null, null);
+ country_cursor = session.open_cursor(
+ "index:poptable:country", null, null);
+ country_cursor2 = session.open_cursor(
+ "index:poptable:country", null, null);
+ year_cursor = session.open_cursor(
+ "index:poptable:immutable_year", null, null);
+
+ /*
+ * select values WHERE (country == "AU" OR country == "UK")
+ * AND year > 1900
+ *
+ * First, set up the join representing the country clause.
+ */
+ country_cursor.putKeyString("AU");
+ ret = country_cursor.search();
+ ret = session.join(subjoin_cursor, country_cursor,
+ "operation=or,compare=eq,count=10");
+ country_cursor2.putKeyString("UK");
+ ret = country_cursor2.search();
+ ret = session.join(subjoin_cursor, country_cursor2,
+ "operation=or,compare=eq,count=10");
+
+ /* Join that to the top join, and add the year clause */
+ ret = session.join(join_cursor, subjoin_cursor, null);
+ year_cursor.putKeyShort((short)1900);
+ ret = year_cursor.search();
+ ret = session.join(join_cursor, year_cursor,
+ "compare=gt,count=10,strategy=bloom");
+
+ /* List the values that are joined */
+ while ((ret = join_cursor.next()) == 0) {
+ recno = join_cursor.getKeyRecord();
+ country = join_cursor.getValueString();
+ year = join_cursor.getValueShort();
+ population = join_cursor.getValueLong();
+ System.out.print("ID " + recno);
+ System.out.println( ": country " + country + ", year " + year +
+ ", population " + population);
+ }
+ /*! [Complex join cursors] */
+
+ ret = join_cursor.close();
+ ret = subjoin_cursor.close();
+ ret = year_cursor.close();
+ ret = country_cursor.close();
+ ret = country_cursor2.close();
ret = conn.close(null);
diff --git a/ext/compressors/zlib/zlib_compress.c b/ext/compressors/zlib/zlib_compress.c
index 4ff0d8576eb..9aede2ed907 100644
--- a/ext/compressors/zlib/zlib_compress.c
+++ b/ext/compressors/zlib/zlib_compress.c
@@ -307,17 +307,9 @@ zlib_compress_raw(WT_COMPRESSOR *compressor, WT_SESSION *session,
/*
* If there's more compression to do, save a snapshot and keep
* going, otherwise, use the current compression.
- *
- * Don't let the compression ratio become insanely good (which
- * can happen with synthetic workloads). Once we hit a limit,
- * stop so the in-memory size of pages isn't hugely larger than
- * the on-disk size, otherwise we can get into trouble where
- * every update to a page results in forced eviction based on
- * the in-memory size, even though the data fits into a single
- * on-disk block.
*/
last_slot = curr_slot;
- if (zs.avail_out > 0 && zs.total_in <= zs.total_out * 20) {
+ if (zs.avail_out > 0) {
if ((ret = deflateCopy(&last_zs, &zs)) != Z_OK)
return (zlib_error(
compressor, session, "deflateCopy", ret));
diff --git a/lang/java/java_doc.i b/lang/java/java_doc.i
index 450cb1d5ab2..2264cb31ef1 100644
--- a/lang/java/java_doc.i
+++ b/lang/java/java_doc.i
@@ -63,6 +63,7 @@ COPYDOC(__wt_connection, WT_CONNECTION, add_collator)
COPYDOC(__wt_connection, WT_CONNECTION, add_compressor)
COPYDOC(__wt_connection, WT_CONNECTION, add_encryptor)
COPYDOC(__wt_connection, WT_CONNECTION, add_extractor)
+COPYDOC(__wt_connection, WT_CONNECTION, set_file_system)
COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, close)
COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, next)
COPYDOC(__wt_config_parser, WT_CONFIG_PARSER, get)
diff --git a/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
index 02639bfe77a..4f05e153607 100644
--- a/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
+++ b/lang/java/src/com/wiredtiger/db/PackFormatInputStream.java
@@ -40,6 +40,7 @@ import com.wiredtiger.db.WiredTigerPackingException;
public class PackFormatInputStream {
protected String format;
+ protected boolean isRaw;
protected int formatOff;
protected int formatRepeatCount;
@@ -48,8 +49,9 @@ public class PackFormatInputStream {
*
* \param format the encoded format backing string.
*/
- protected PackFormatInputStream(String format) {
+ protected PackFormatInputStream(String format, boolean isRaw) {
this.format = format;
+ this.isRaw = isRaw;
formatOff = 0;
formatRepeatCount = 0;
}
@@ -114,6 +116,9 @@ public class PackFormatInputStream {
throws WiredTigerPackingException {
char expected = getType();
+ if (isRaw)
+ throw new WiredTigerPackingException(
+ "Format mismatch for raw mode");
if (Character.toLowerCase(expected) != Character.toLowerCase(asking))
throw new WiredTigerPackingException(
"Format mismatch. Wanted: " + asking + ", got: " + expected);
diff --git a/lang/java/src/com/wiredtiger/db/PackInputStream.java b/lang/java/src/com/wiredtiger/db/PackInputStream.java
index f265d041d94..732bf450acd 100644
--- a/lang/java/src/com/wiredtiger/db/PackInputStream.java
+++ b/lang/java/src/com/wiredtiger/db/PackInputStream.java
@@ -43,6 +43,7 @@ public class PackInputStream {
protected byte[] value;
protected int valueOff;
protected int valueLen;
+ protected boolean isRaw;
/**
* Constructor.
@@ -52,7 +53,7 @@ public class PackInputStream {
* \param value The raw bytes that back the stream.
*/
public PackInputStream(String format, byte[] value) {
- this(format, value, 0, value.length);
+ this(format, value, false, 0, value.length);
}
/**
@@ -61,14 +62,29 @@ public class PackInputStream {
* \param format A String that contains the WiredTiger format that
* defines the layout of this packed value.
* \param value The raw bytes that back the stream.
+ * \param isRaw The stream is opened raw.
+ */
+ public PackInputStream(String format, byte[] value, boolean isRaw) {
+ this(format, value, isRaw, 0, value.length);
+ }
+
+ /**
+ * Constructor.
+ *
+ * \param format A String that contains the WiredTiger format that
+ * defines the layout of this packed value.
+ * \param value The raw bytes that back the stream.
+ * \param isRaw The stream is opened raw.
* \param off Offset into the value array at which the stream begins.
* \param len Length of the value array that forms the stream.
*/
- public PackInputStream(String format, byte[] value, int off, int len) {
- this.format = new PackFormatInputStream(format);
+ public PackInputStream(
+ String format, byte[] value, boolean isRaw, int off, int len) {
+ this.format = new PackFormatInputStream(format, isRaw);
this.value = value;
this.valueOff = off;
this.valueLen = len;
+ this.isRaw = isRaw;
}
/**
@@ -117,7 +133,9 @@ public class PackInputStream {
*/
public void getByteArray(byte[] dest, int off, int len)
throws WiredTigerPackingException {
- format.checkType('U', false);
+ if (!isRaw) {
+ format.checkType('U', false);
+ }
getByteArrayInternal(getByteArrayLength(), dest, off, len);
}
@@ -127,7 +145,9 @@ public class PackInputStream {
*/
public byte[] getByteArray()
throws WiredTigerPackingException {
- format.checkType('U', false);
+ if (!isRaw) {
+ format.checkType('U', false);
+ }
int itemLen = getByteArrayLength();
byte[] unpacked = new byte[itemLen];
getByteArrayInternal(itemLen, unpacked, 0, itemLen);
@@ -142,7 +162,10 @@ public class PackInputStream {
throws WiredTigerPackingException {
int itemLen = 0;
- if (format.hasLength()) {
+ if (isRaw) {
+ // The rest of the buffer is a byte array.
+ itemLen = valueLen - valueOff;
+ } else if (format.hasLength()) {
// If the format has a length, it's always used.
itemLen = format.getLengthFromFormat(true);
} else if (format.getType() == 'U') {
diff --git a/lang/java/src/com/wiredtiger/db/PackOutputStream.java b/lang/java/src/com/wiredtiger/db/PackOutputStream.java
index 805e34f6ca8..46b3aef0974 100644
--- a/lang/java/src/com/wiredtiger/db/PackOutputStream.java
+++ b/lang/java/src/com/wiredtiger/db/PackOutputStream.java
@@ -50,7 +50,7 @@ public class PackOutputStream {
* defines the layout of this packed value.
*/
public PackOutputStream(String format) {
- this.format = new PackFormatInputStream(format);
+ this.format = new PackFormatInputStream(format, false);
intBuf = new byte[MAX_INT_BYTES];
packed = new ByteArrayOutputStream(100);
}
diff --git a/lang/java/wiredtiger.i b/lang/java/wiredtiger.i
index ce013a1939c..c04bae63cbc 100644
--- a/lang/java/wiredtiger.i
+++ b/lang/java/wiredtiger.i
@@ -80,6 +80,7 @@ typedef struct {
JavaVM *javavm; /* Used in async threads to craft a jnienv */
JNIEnv *jnienv; /* jni env that created the Session/Cursor */
WT_SESSION_IMPL *session; /* session used for alloc/free */
+ bool cursor_raw; /* is the cursor opened raw? */
jobject jobj; /* the java Session/Cursor/AsyncOp object */
jobject jcallback; /* callback object for async ops */
jfieldID cptr_fid; /* cached Cursor.swigCPtr field id in session */
@@ -576,8 +577,15 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
return $self->update($self);
}
- %javamethodmodifiers java_init "protected";
- int java_init(jobject jasyncop) {
+ %javamethodmodifiers _java_raw "protected";
+ bool _java_raw(JNIEnv *jenv) {
+ (void)jenv;
+ JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->c.lang_private;
+ return jcb->cursor_raw;
+ }
+
+ %javamethodmodifiers _java_init "protected";
+ int _java_init(jobject jasyncop) {
JAVA_CALLBACK *jcb =
(JAVA_CALLBACK *)$self->c.lang_private;
jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jasyncop);
@@ -604,7 +612,7 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
valueFormat = getValue_format();
keyPacker = new PackOutputStream(keyFormat);
valuePacker = new PackOutputStream(valueFormat);
- wiredtigerJNI.AsyncOp_java_init(swigCPtr, this, this);
+ wiredtigerJNI.AsyncOp__java_init(swigCPtr, this, this);
}
protected static long getCPtr($javaclassname obj) {
@@ -1090,7 +1098,8 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
throws WiredTigerPackingException {
if (keyUnpacker == null)
keyUnpacker =
- new PackInputStream(keyFormat, get_key_wrap());
+ new PackInputStream(keyFormat, get_key_wrap(),
+ _java_raw());
return keyUnpacker;
}
@@ -1103,7 +1112,8 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
throws WiredTigerPackingException {
if (valueUnpacker == null)
valueUnpacker =
- new PackInputStream(valueFormat, get_value_wrap());
+ new PackInputStream(valueFormat, get_value_wrap(),
+ _java_raw());
return valueUnpacker;
}
@@ -1175,6 +1185,7 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
return $self->update($self);
}
+ %javamethodmodifiers compare_wrap "protected";
int compare_wrap(JNIEnv *jenv, WT_CURSOR *other) {
int cmp, ret = $self->compare($self, other, &cmp);
if (ret != 0)
@@ -1182,6 +1193,7 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
return cmp;
}
+ %javamethodmodifiers equals_wrap "protected";
int equals_wrap(JNIEnv *jenv, WT_CURSOR *other) {
int cmp, ret = $self->equals($self, other, &cmp);
if (ret != 0)
@@ -1189,8 +1201,15 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
return cmp;
}
- %javamethodmodifiers java_init "protected";
- int java_init(jobject jcursor) {
+ %javamethodmodifiers _java_raw "protected";
+ bool _java_raw(JNIEnv *jenv) {
+ (void)jenv;
+ JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->lang_private;
+ return jcb->cursor_raw;
+ }
+
+ %javamethodmodifiers _java_init "protected";
+ int _java_init(jobject jcursor) {
JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)$self->lang_private;
jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jcursor);
JCALL1(DeleteLocalRef, jcb->jnienv, jcursor);
@@ -1216,7 +1235,7 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
valueFormat = getValue_format();
keyPacker = new PackOutputStream(keyFormat);
valuePacker = new PackOutputStream(valueFormat);
- wiredtigerJNI.Cursor_java_init(swigCPtr, this, this);
+ wiredtigerJNI.Cursor__java_init(swigCPtr, this, this);
}
protected static long getCPtr($javaclassname obj) {
@@ -1773,7 +1792,8 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
if (!success || keyFormat.equals(""))
return null;
else
- return new PackInputStream(keyFormat, get_key_wrap());
+ return new PackInputStream(keyFormat,
+ get_key_wrap(), _java_raw());
}
/**
@@ -1789,7 +1809,7 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
return null;
else
return new PackInputStream(valueFormat,
- get_value_wrap());
+ get_value_wrap(), _java_raw());
}
%}
@@ -1799,20 +1819,22 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
*/
%javaexception("com.wiredtiger.db.WiredTigerException") { $action; }
%javaexception("") wiredtiger_strerror { $action; }
+%javaexception("") __wt_async_op::_java_raw { $action; }
%javaexception("") __wt_async_op::connection { $action; }
%javaexception("") __wt_async_op::get_type { $action; }
%javaexception("") __wt_async_op::get_id { $action; }
%javaexception("") __wt_async_op::key_format { $action; }
%javaexception("") __wt_async_op::value_format { $action; }
+%javaexception("") __wt_connection::_java_init { $action; }
%javaexception("") __wt_connection::get_home { $action; }
%javaexception("") __wt_connection::is_new { $action; }
-%javaexception("") __wt_connection::java_init { $action; }
+%javaexception("") __wt_cursor::_java_raw { $action; }
%javaexception("") __wt_cursor::key_format { $action; }
%javaexception("") __wt_cursor::session { $action; }
%javaexception("") __wt_cursor::uri { $action; }
%javaexception("") __wt_cursor::value_format { $action; }
+%javaexception("") __wt_session::_java_init { $action; }
%javaexception("") __wt_session::connection { $action; }
-%javaexception("") __wt_session::java_init { $action; }
/* Remove / rename parts of the C API that we don't want in Java. */
%immutable __wt_cursor::session;
@@ -1832,6 +1854,9 @@ WT_ASYNC_CALLBACK javaApiAsyncHandler = {javaAsyncHandler};
%ignore __wt_event_handler;
%ignore __wt_extractor;
%ignore __wt_connection::add_extractor;
+%ignore __wt_file_system;
+%ignore __wt_file_handle;
+%ignore __wt_connection::set_file_system;
%ignore __wt_item;
%ignore __wt_lsn;
%ignore __wt_session::msg_printf;
@@ -1890,8 +1915,8 @@ REQUIRE_WRAP(WT_ASYNC_OP::get_id, __wt_async_op::get_id,getId)
%}
%extend ctypename {
- %javamethodmodifiers java_init "protected";
- int java_init(jobject jsess) {
+ %javamethodmodifiers _java_init "protected";
+ int _java_init(jobject jsess) {
implclass *session = (implclass *)$self;
JAVA_CALLBACK *jcb = (JAVA_CALLBACK *)session->lang_private;
jcb->jobj = JCALL1(NewGlobalRef, jcb->jnienv, jsess);
@@ -1901,8 +1926,8 @@ REQUIRE_WRAP(WT_ASYNC_OP::get_id, __wt_async_op::get_id,getId)
}
%enddef
-TRACKED_CLASS(Session, __wt_session, wiredtigerJNI.Session_java_init, WT_SESSION_IMPL)
-TRACKED_CLASS(Connection, __wt_connection, wiredtigerJNI.Connection_java_init, WT_CONNECTION_IMPL)
+TRACKED_CLASS(Session, __wt_session, wiredtigerJNI.Session__java_init, WT_SESSION_IMPL)
+TRACKED_CLASS(Connection, __wt_connection, wiredtigerJNI.Connection__java_init, WT_CONNECTION_IMPL)
/* Note: Cursor incorporates the elements of TRACKED_CLASS into its
* custom constructor and %extend clause.
*/
@@ -1996,13 +2021,15 @@ err: if (ret != 0)
if ((ret = $self->open_cursor($self, uri, to_dup, config, &cursor)) != 0)
goto err;
- if ((cursor->flags & WT_CURSTD_DUMP_JSON) == 0)
- cursor->flags |= WT_CURSTD_RAW;
-
if ((ret = __wt_calloc_def((WT_SESSION_IMPL *)cursor->session,
1, &jcb)) != 0)
goto err;
+ if ((cursor->flags & WT_CURSTD_RAW) != 0)
+ jcb->cursor_raw = true;
+ if ((cursor->flags & WT_CURSTD_DUMP_JSON) == 0)
+ cursor->flags |= WT_CURSTD_RAW;
+
jcb->jnienv = jenv;
jcb->session = (WT_SESSION_IMPL *)cursor->session;
cursor->lang_private = jcb;
diff --git a/src/block/block_ckpt.c b/src/block/block_ckpt.c
index a861a21876b..716121faa06 100644
--- a/src/block/block_ckpt.c
+++ b/src/block/block_ckpt.c
@@ -140,12 +140,10 @@ __wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
* will unnecessarily allocate buffer space.
*/
if (!checkpoint && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
- /*
- * The truncate might fail if there's a file mapping (if there's
- * an open checkpoint on the file), that's OK.
- */
WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
"truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
+
+ /* The truncate might fail, and that's OK. */
WT_ERR_BUSY_OK(
__wt_block_truncate(session, block, ci->file_size));
}
@@ -190,10 +188,7 @@ __wt_block_checkpoint_unload(
* checkpoints.
*/
if (!checkpoint) {
- /*
- * The truncate might fail if there's a file mapping (if there's
- * an open checkpoint on the file), that's OK.
- */
+ /* The truncate might fail, and that's OK. */
WT_TRET_BUSY_OK(
__wt_block_truncate(session, block, block->size));
diff --git a/src/block/block_ext.c b/src/block/block_ext.c
index caafcc77c48..6d67a66be5f 100644
--- a/src/block/block_ext.c
+++ b/src/block/block_ext.c
@@ -1362,9 +1362,8 @@ __wt_block_extlist_truncate(
block->size = size;
/*
- * Truncate the file. The truncate might fail if there's a file mapping
- * (if there's an open checkpoint on the file), that's OK, we'll ignore
- * those blocks.
+ * Truncate the file. The truncate might fail, and that's OK, we simply
+ * ignore those blocks.
*/
WT_RET(__wt_verbose(session, WT_VERB_BLOCK,
"truncate file from %" PRIdMAX " to %" PRIdMAX,
diff --git a/src/block/block_map.c b/src/block/block_map.c
index b16fe7f8423..ce6fe8602f5 100644
--- a/src/block/block_map.c
+++ b/src/block/block_map.c
@@ -13,24 +13,16 @@
* Map a segment of the file in, if possible.
*/
int
-__wt_block_map(
- WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp,
- void **mappingcookie)
+__wt_block_map(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ void *mapped_regionp, size_t *lengthp, void *mapped_cookiep)
{
WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
- *(void **)mapp = NULL;
- *maplenp = 0;
+ *(void **)mapped_regionp = NULL;
+ *lengthp = 0;
+ *(void **)mapped_cookiep = NULL;
-#ifdef WORDS_BIGENDIAN
- /*
- * The underlying objects are little-endian, mapping objects isn't
- * currently supported on big-endian systems.
- */
- WT_UNUSED(session);
- WT_UNUSED(block);
- WT_UNUSED(mappingcookie);
-#else
/* Map support is configurable. */
if (!S2C(session)->mmap)
return (0);
@@ -51,15 +43,23 @@ __wt_block_map(
return (0);
/*
+ * The underlying file handle may not support mapping.
+ */
+ handle = block->fh->handle;
+ if (handle->map == NULL)
+ return (0);
+
+ /*
* Map the file into memory.
* Ignore not-supported errors, we'll read the file through the cache
* if map fails.
*/
- ret = block->fh->fh_map(
- session, block->fh, mapp, maplenp, mappingcookie);
- if (ret == ENOTSUP)
+ ret = handle->map(handle,
+ (WT_SESSION *)session, mapped_regionp, lengthp, mapped_cookiep);
+ if (ret == ENOTSUP) {
+ *(void **)mapped_regionp = NULL;
ret = 0;
-#endif
+ }
return (ret);
}
@@ -69,11 +69,13 @@ __wt_block_map(
* Unmap any mapped-in segment of the file.
*/
int
-__wt_block_unmap(
- WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen,
- void **mappingcookie)
+__wt_block_unmap(WT_SESSION_IMPL *session,
+ WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie)
{
+ WT_FILE_HANDLE *handle;
+
/* Unmap the file from memory. */
- return (block->fh->fh_map_unmap(
- session, block->fh, map, maplen, mappingcookie));
+ handle = block->fh->handle;
+ return (handle->unmap(handle,
+ (WT_SESSION *)session, mapped_region, length, mapped_cookie));
}
diff --git a/src/block/block_mgr.c b/src/block/block_mgr.c
index 06150a0f062..465952d8ca5 100644
--- a/src/block/block_mgr.c
+++ b/src/block/block_mgr.c
@@ -103,7 +103,7 @@ __bm_checkpoint_load(WT_BM *bm, WT_SESSION_IMPL *session,
* of being read into cache buffers.
*/
WT_RET(__wt_block_map(session,
- bm->block, &bm->map, &bm->maplen, &bm->mappingcookie));
+ bm->block, &bm->map, &bm->maplen, &bm->mapped_cookie));
/*
* If this handle is for a checkpoint, that is, read-only, there
@@ -149,7 +149,7 @@ __bm_checkpoint_unload(WT_BM *bm, WT_SESSION_IMPL *session)
/* Unmap any mapped segment. */
if (bm->map != NULL)
WT_TRET(__wt_block_unmap(session,
- bm->block, bm->map, bm->maplen, &bm->mappingcookie));
+ bm->block, bm->map, bm->maplen, &bm->mapped_cookie));
/* Unload the checkpoint. */
WT_TRET(__wt_block_checkpoint_unload(session, bm->block, !bm->is_live));
@@ -302,6 +302,20 @@ __bm_is_mapped(WT_BM *bm, WT_SESSION_IMPL *session)
}
/*
+ * __bm_map_discard --
+ * Discard a mapped segment.
+ */
+static int
+__bm_map_discard(WT_BM *bm, WT_SESSION_IMPL *session, void *map, size_t len)
+{
+ WT_FILE_HANDLE *handle;
+
+ handle = bm->block->fh->handle;
+ return (handle->map_discard(
+ handle, (WT_SESSION *)session, map, len, bm->mapped_cookie));
+}
+
+/*
* __bm_salvage_end --
* End a block manager salvage.
*/
@@ -413,19 +427,7 @@ __bm_stat(WT_BM *bm, WT_SESSION_IMPL *session, WT_DSRC_STATS *stats)
static int
__bm_sync(WT_BM *bm, WT_SESSION_IMPL *session, bool block)
{
- WT_DECL_RET;
-
- if (!block && !bm->block->nowait_sync_available)
- return (0);
-
- if ((ret = __wt_fsync(session, bm->block->fh, block)) == 0)
- return (0);
-
- /* Ignore ENOTSUP, but don't try again. */
- if (ret != ENOTSUP)
- return (ret);
- bm->block->nowait_sync_available = false;
- return (0);
+ return (__wt_fsync(session, bm->block->fh, block));
}
/*
@@ -544,6 +546,7 @@ __bm_method_set(WT_BM *bm, bool readonly)
bm->compact_start = __bm_compact_start;
bm->free = __bm_free;
bm->is_mapped = __bm_is_mapped;
+ bm->map_discard = __bm_map_discard;
bm->preload = __wt_bm_preload;
bm->read = __wt_bm_read;
bm->salvage_end = __bm_salvage_end;
diff --git a/src/block/block_open.c b/src/block/block_open.c
index f4da5ca7c05..e58bef30a6d 100644
--- a/src/block/block_open.c
+++ b/src/block/block_open.c
@@ -33,7 +33,6 @@ __wt_block_manager_create(
WT_FH *fh;
int suffix;
bool exists;
- char *path;
/*
* Create the underlying file and open a handle.
@@ -44,7 +43,7 @@ __wt_block_manager_create(
* in our space. Move any existing files out of the way and complain.
*/
for (;;) {
- if ((ret = __wt_open(session, filename, WT_FILE_TYPE_DATA,
+ if ((ret = __wt_open(session, filename, WT_OPEN_FILE_TYPE_DATA,
WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &fh)) == 0)
break;
WT_ERR_TEST(ret != EEXIST, ret);
@@ -54,10 +53,10 @@ __wt_block_manager_create(
for (suffix = 1;; ++suffix) {
WT_ERR(__wt_buf_fmt(
session, tmp, "%s.%d", filename, suffix));
- WT_ERR(__wt_exist(session, tmp->data, &exists));
+ WT_ERR(__wt_fs_exist(session, tmp->data, &exists));
if (!exists) {
- WT_ERR(
- __wt_rename(session, filename, tmp->data));
+ WT_ERR(__wt_fs_rename(
+ session, filename, tmp->data));
WT_ERR(__wt_msg(session,
"unexpected file %s found, renamed to %s",
filename, (char *)tmp->data));
@@ -82,14 +81,12 @@ __wt_block_manager_create(
* Some filesystems require that we sync the directory to be confident
* that the file will appear.
*/
- if (ret == 0 && (ret = __wt_filename(session, filename, &path)) == 0) {
- ret = __wt_directory_sync(session, path);
- __wt_free(session, path);
- }
+ if (ret == 0)
+ WT_TRET(__wt_fs_directory_sync(session, filename));
/* Undo any create on error. */
if (ret != 0)
- WT_TRET(__wt_remove(session, filename));
+ WT_TRET(__wt_fs_remove(session, filename));
err: __wt_scr_free(session, &tmp);
@@ -156,8 +153,7 @@ __wt_block_open(WT_SESSION_IMPL *session,
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
uint64_t bucket, hash;
-
- WT_UNUSED(readonly);
+ uint32_t flags;
WT_RET(__wt_verbose(session, WT_VERB_BLOCK, "open: %s", filename));
@@ -204,12 +200,18 @@ __wt_block_open(WT_SESSION_IMPL *session,
/* Set the file extension information. */
block->extend_len = conn->data_extend_len;
- /* Set the asynchronous flush, preload availability. */
- block->nowait_sync_available = true;
- block->preload_available = true;
-
- /* Open the underlying file handle. */
- WT_ERR(__wt_open(session, filename, WT_FILE_TYPE_DATA, 0, &block->fh));
+ /*
+ * Open the underlying file handle.
+ *
+ * "direct_io=checkpoint" configures direct I/O for readonly data files.
+ */
+ flags = 0;
+ if (readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_CHECKPOINT))
+ LF_SET(WT_OPEN_DIRECTIO);
+ if (!readonly && FLD_ISSET(conn->direct_io, WT_DIRECT_IO_DATA))
+ LF_SET(WT_OPEN_DIRECTIO);
+ WT_ERR(__wt_open(
+ session, filename, WT_OPEN_FILE_TYPE_DATA, flags, &block->fh));
/* Set the file's size. */
WT_ERR(__wt_filesize(session, block->fh, &block->size));
@@ -422,5 +424,5 @@ int
__wt_block_manager_named_size(
WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
{
- return (__wt_filesize_name(session, name, false, sizep));
+ return (__wt_fs_size(session, name, sizep));
}
diff --git a/src/block/block_read.c b/src/block/block_read.c
index 6f0c41c1b5c..7304f6ff4bc 100644
--- a/src/block/block_read.c
+++ b/src/block/block_read.c
@@ -19,44 +19,32 @@ __wt_bm_preload(
WT_BLOCK *block;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
wt_off_t offset;
uint32_t cksum, size;
bool mapped;
WT_UNUSED(addr_size);
+
block = bm->block;
WT_STAT_FAST_CONN_INCR(session, block_preload);
- /* Preload the block. */
- if (block->preload_available) {
- /* Crack the cookie. */
- WT_RET(__wt_block_buffer_to_addr(
- block, addr, &offset, &size, &cksum));
-
- mapped = bm->map != NULL &&
- offset + size <= (wt_off_t)bm->maplen;
- if (mapped)
- ret = block->fh->fh_map_preload(session,
- block->fh, (uint8_t *)bm->map + offset, size);
- else
- ret = block->fh->fh_advise(session,
- block->fh, (wt_off_t)offset,
- (wt_off_t)size, POSIX_FADV_WILLNEED);
- if (ret == 0)
- return (0);
-
- /* Ignore ENOTSUP, but don't try again. */
- if (ret != ENOTSUP)
- return (ret);
- block->preload_available = false;
- }
+ /* Crack the cookie. */
+ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));
- /*
- * If preload isn't supported, do it the slow way; don't call the
- * underlying read routine directly, we don't know for certain if
- * this is a mapped range.
- */
+ handle = block->fh->handle;
+ mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
+ if (mapped && handle->map_preload != NULL)
+ ret = handle->map_preload(handle, (WT_SESSION *)session,
+ (uint8_t *)bm->map + offset, size, bm->mapped_cookie);
+ if (!mapped && handle->fadvise != NULL)
+ ret = handle->fadvise(handle, (WT_SESSION *)session,
+ (wt_off_t)offset, (wt_off_t)size, WT_FILE_HANDLE_WILLNEED);
+ if (ret != EBUSY && ret != ENOTSUP)
+ return (ret);
+
+ /* If preload was busy or isn't supported, do it the slow way. */
WT_RET(__wt_scr_alloc(session, 0, &tmp));
ret = __wt_bm_read(bm, session, tmp, addr, addr_size);
__wt_scr_free(session, &tmp);
@@ -74,6 +62,7 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
{
WT_BLOCK *block;
WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
wt_off_t offset;
uint32_t cksum, size;
bool mapped;
@@ -87,23 +76,17 @@ __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
/*
* Map the block if it's possible.
*/
+ handle = block->fh->handle;
mapped = bm->map != NULL && offset + size <= (wt_off_t)bm->maplen;
- if (mapped) {
+ if (mapped && handle->map_preload != NULL) {
buf->data = (uint8_t *)bm->map + offset;
buf->size = size;
- if (block->preload_available) {
- ret = block->fh->fh_map_preload(
- session, block->fh, buf->data, buf->size);
-
- /* Ignore ENOTSUP, but don't try again. */
- if (ret != ENOTSUP)
- return (ret);
- block->preload_available = false;
- }
+ ret = handle->map_preload(handle, (WT_SESSION *)session,
+ buf->data, buf->size,bm->mapped_cookie);
WT_STAT_FAST_CONN_INCR(session, block_map_read);
WT_STAT_FAST_CONN_INCRV(session, block_byte_map_read, size);
- return (0);
+ return (ret);
}
#ifdef HAVE_DIAGNOSTIC
diff --git a/src/block/block_vrfy.c b/src/block/block_vrfy.c
index 6570184ca10..a8e59ad0af7 100644
--- a/src/block/block_vrfy.c
+++ b/src/block/block_vrfy.c
@@ -15,7 +15,7 @@ static int __verify_filefrag_add(
WT_SESSION_IMPL *, WT_BLOCK *, const char *, wt_off_t, wt_off_t, bool);
static int __verify_filefrag_chk(WT_SESSION_IMPL *, WT_BLOCK *);
static int __verify_last_avail(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
-static int __verify_last_truncate(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
+static int __verify_set_file_size(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
/* The bit list ignores the first block: convert to/from a frag/offset. */
#define WT_wt_off_TO_FRAG(block, off) \
@@ -49,8 +49,8 @@ __wt_block_verify_start(WT_SESSION_IMPL *session,
return (0);
}
- /* Truncate the file to the size of the last checkpoint. */
- WT_RET(__verify_last_truncate(session, block, ckpt));
+ /* Set the size of the file to the size of the last checkpoint. */
+ WT_RET(__verify_set_file_size(session, block, ckpt));
/*
* We're done if the file has no data pages (this happens if we verify
@@ -144,11 +144,11 @@ err: __wt_block_ckpt_destroy(session, ci);
}
/*
- * __verify_last_truncate --
- * Truncate the file to the last checkpoint's size.
+ * __verify_set_file_size --
+ * Set the file size to the last checkpoint's size.
*/
static int
-__verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
+__verify_set_file_size(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
WT_BLOCK_CKPT *ci, _ci;
WT_DECL_RET;
@@ -156,7 +156,13 @@ __verify_last_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
ci = &_ci;
WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
WT_ERR(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
- WT_ERR_BUSY_OK(__wt_block_truncate(session, block, ci->file_size));
+
+ /*
+ * Verify is read-only. Set the block's file size information as if we
+ * truncated the file during checkpoint load, so references to blocks
+ * after the last checkpoint's file size fail.
+ */
+ block->size = block->extend_size = ci->file_size;
err: __wt_block_ckpt_destroy(session, ci);
return (ret);
diff --git a/src/block/block_write.c b/src/block/block_write.c
index 134272b52f9..4f1224f3c13 100644
--- a/src/block/block_write.c
+++ b/src/block/block_write.c
@@ -15,6 +15,24 @@
int
__wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
{
+ /*
+ * Backups are done by copying files outside of WiredTiger, potentially
+ * by system utilities. We cannot truncate the file during the backup
+ * window because we might surprise an application.
+ *
+ * Stop block truncation. This also affects files that aren't involved
+ * in the backup: for example, an incremental backup (which only copies
+ * log files) or a targeted backup still stops all truncation. We may
+ * want a more targeted solution at some point.
+ */
+ if (S2C(session)->hot_backup)
+ return (EBUSY);
+
+ /*
+ * Additionally, the truncate might fail if there's a file mapping (if
+ * there's an open checkpoint on the file), in which case the underlying
+ * function returns EBUSY.
+ */
WT_RET(__wt_ftruncate(session, block->fh, len));
block->size = block->extend_size = len;
@@ -30,27 +48,28 @@ int
__wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t added_size)
{
WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
+ /* The file may not support this call. */
+ handle = block->fh->handle;
+ if (handle->fadvise == NULL)
+ return (0);
+
+ /* The call may not be configured. */
if (block->os_cache_max == 0)
return (0);
/*
* We're racing on the addition, but I'm not willing to serialize on it
- * in the standard read path with more evidence it's needed.
+ * in the standard read path without evidence it's needed.
*/
if ((block->os_cache += added_size) <= block->os_cache_max)
return (0);
block->os_cache = 0;
- WT_ERR(block->fh->fh_advise(session,
- block->fh, (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED));
- return (0);
-
-err: /* Ignore ENOTSUP, but don't try again. */
- if (ret != ENOTSUP)
- return (ret);
- block->os_cache_max = 0;
- return (0);
+ ret = handle->fadvise(handle, (WT_SESSION *)session,
+ (wt_off_t)0, (wt_off_t)0, WT_FILE_HANDLE_DONTNEED);
+ return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
/*
@@ -62,6 +81,7 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_FH *fh, wt_off_t offset, size_t align_size, bool *release_lockp)
{
WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
bool locked;
/*
@@ -107,7 +127,8 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
* based on the filesystem type, fall back to ftruncate in that case,
* and remember that ftruncate requires locking.
*/
- if (fh->fallocate_available != WT_FALLOCATE_NOT_AVAILABLE) {
+ handle = fh->handle;
+ if (handle->fallocate != NULL || handle->fallocate_nolock != NULL) {
/*
* Release any locally acquired lock if not needed to extend the
* file, extending the file may require updating on-disk file's
@@ -115,7 +136,7 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
* configure for file extension on systems that require locking
* over the extend call.)
*/
- if (!fh->fallocate_requires_locking && *release_lockp) {
+ if (handle->fallocate_nolock != NULL && *release_lockp) {
*release_lockp = locked = false;
__wt_spin_unlock(session, &block->live_lock);
}
@@ -131,8 +152,7 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
if ((ret = __wt_fallocate(
session, fh, block->size, block->extend_len * 2)) == 0)
return (0);
- if (ret != ENOTSUP)
- return (ret);
+ WT_RET_ERROR_OK(ret, ENOTSUP);
}
/*
@@ -155,9 +175,8 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
* The truncate might fail if there's a mapped file (in other words, if
* there's an open checkpoint on the file), that's OK.
*/
- if ((ret = __wt_ftruncate(session, fh, block->extend_size)) == EBUSY)
- ret = 0;
- return (ret);
+ WT_RET_BUSY_OK(__wt_ftruncate(session, fh, block->extend_size));
+ return (0);
}
/*
diff --git a/src/bloom/bloom.c b/src/bloom/bloom.c
index 505630f12cf..e32544d5521 100644
--- a/src/bloom/bloom.c
+++ b/src/bloom/bloom.c
@@ -295,7 +295,7 @@ __wt_bloom_hash_get(WT_BLOOM *bloom, WT_BLOOM_HASH *bhash)
err: /* Don't return WT_NOTFOUND from a failed search. */
if (ret == WT_NOTFOUND)
ret = WT_ERROR;
- __wt_err(bloom->session, ret, "Failed lookup in bloom filter.");
+ __wt_err(bloom->session, ret, "Failed lookup in bloom filter");
return (ret);
}
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 63b2e2abebc..70b3ba56e31 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -86,10 +86,10 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->last_standard_recno = __col_fix_last_recno(page);
+ cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
- __cursor_set_recno(cbt, page->pg_fix_recno);
+ __cursor_set_recno(cbt, cbt->ref->ref_recno);
goto new_page;
}
@@ -107,7 +107,7 @@ new_page:
cbt->ins = NULL;
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
- cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
val->data = &cbt->v;
} else
val->data = WT_UPDATE_DATA(upd);
@@ -179,10 +179,10 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->last_standard_recno = __col_var_last_recno(page);
+ cbt->last_standard_recno = __col_var_last_recno(cbt->ref);
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
- __cursor_set_recno(cbt, page->pg_var_recno);
+ __cursor_set_recno(cbt, cbt->ref->ref_recno);
goto new_page;
}
@@ -194,7 +194,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
new_page: /* Find the matching WT_COL slot. */
if ((cip =
- __col_var_search(page, cbt->recno, &rle_start)) == NULL)
+ __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
return (WT_NOTFOUND);
cbt->slot = WT_COL_SLOT(page, cip);
@@ -558,7 +558,8 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt)
* page.
*/
cbt->last_standard_recno = page->type == WT_PAGE_COL_VAR ?
- __col_var_last_recno(page) : __col_fix_last_recno(page);
+ __col_var_last_recno(cbt->ref) :
+ __col_fix_last_recno(cbt->ref);
/* If we're traversing the append list, set the reference. */
if (cbt->ins_head != NULL &&
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index 7475c0f1312..872f648446c 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -128,12 +128,10 @@ static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_ITEM *val;
- WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- page = cbt->ref->page;
val = &cbt->iface.value;
if (newpage) {
@@ -176,8 +174,8 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
* to a record number matching the first record on the page.
*/
if (cbt->ins == NULL &&
- (cbt->recno == page->pg_fix_recno ||
- __col_fix_last_recno(page) != 0))
+ (cbt->recno == cbt->ref->ref_recno ||
+ __col_fix_last_recno(cbt->ref) != 0))
return (WT_NOTFOUND);
}
@@ -234,7 +232,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->last_standard_recno = __col_fix_last_recno(page);
+ cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->last_standard_recno);
@@ -242,7 +240,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
}
/* Move to the previous entry and return the item. */
- if (cbt->recno == page->pg_fix_recno)
+ if (cbt->recno == cbt->ref->ref_recno)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->recno - 1);
@@ -255,7 +253,7 @@ new_page:
cbt->ins = NULL;
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
- cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt);
+ cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
val->data = &cbt->v;
} else
val->data = WT_UPDATE_DATA(upd);
@@ -327,7 +325,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
/* Initialize for each new page. */
if (newpage) {
- cbt->last_standard_recno = __col_var_last_recno(page);
+ cbt->last_standard_recno = __col_var_last_recno(cbt->ref);
if (cbt->last_standard_recno == 0)
return (WT_NOTFOUND);
__cursor_set_recno(cbt, cbt->last_standard_recno);
@@ -338,12 +336,12 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
for (;;) {
__cursor_set_recno(cbt, cbt->recno - 1);
-new_page: if (cbt->recno < page->pg_var_recno)
+new_page: if (cbt->recno < cbt->ref->ref_recno)
return (WT_NOTFOUND);
/* Find the matching WT_COL slot. */
if ((cip =
- __col_var_search(page, cbt->recno, &rle_start)) == NULL)
+ __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
return (WT_NOTFOUND);
cbt->slot = WT_COL_SLOT(page, cip);
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 1f3ac443495..4b73b76c8c8 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -164,12 +164,12 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
* column-store pages don't have slots, but map one-to-one to
* keys, check for retrieval past the end of the page.
*/
- if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries)
+ if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries)
return (false);
/*
- * Updates aren't stored on the page, an update would have
- * appeared as an "insert" object; no further checks to do.
+ * An update would have appeared as an "insert" object; no
+ * further checks to do.
*/
break;
case BTREE_COL_VAR:
@@ -179,19 +179,18 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
WT_ASSERT(session, cbt->slot < page->pg_var_entries);
/*
- * Column-store updates aren't stored on the page, instead they
- * are stored as "insert" objects. If search returned an insert
- * object we can't return, the returned on-page object must be
- * checked for a match.
+ * Column-store updates are stored as "insert" objects. If
+ * search returned an insert object we can't return, the
+ * returned on-page object must be checked for a match.
*/
if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH))
return (false);
/*
- * Updates aren't stored on the page, an update would have
- * appeared as an "insert" object; however, variable-length
- * column store deletes are written into the backing store,
- * check the cell for a record already deleted when read.
+ * Although updates would have appeared as "insert" objects,
+ * variable-length column store deletes are written into the
+ * backing store; check the cell for a record already deleted
+ * when read.
*/
cip = &page->pg_var_d[cbt->slot];
if ((cell = WT_COL_PTR(page, cip)) == NULL ||
@@ -211,9 +210,11 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
if (cbt->ins != NULL)
return (false);
- /* Updates are stored on the page, check for a delete. */
- if (page->pg_row_upd != NULL && (upd = __wt_txn_read(
- session, page->pg_row_upd[cbt->slot])) != NULL) {
+ /* Check for an update. */
+ if (page->modify != NULL &&
+ page->modify->mod_row_update != NULL &&
+ (upd = __wt_txn_read(session,
+ page->modify->mod_row_update[cbt->slot])) != NULL) {
if (WT_UPDATE_DELETED_ISSET(upd))
return (false);
if (updp != NULL)
@@ -325,7 +326,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
valid = false;
if (F_ISSET(cbt, WT_CBT_ACTIVE) &&
cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
- __wt_txn_cursor_op(session);
+ WT_ERR(__wt_txn_cursor_op(session));
WT_ERR(btree->type == BTREE_ROW ?
__cursor_row_search(session, cbt, cbt->ref, false) :
@@ -405,7 +406,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
if (btree->type == BTREE_ROW &&
F_ISSET(cbt, WT_CBT_ACTIVE) &&
cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
- __wt_txn_cursor_op(session);
+ WT_ERR(__wt_txn_cursor_op(session));
WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));
@@ -596,9 +597,12 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt)
return (0);
if (cbt->ins != NULL)
return (__wt_txn_update_check(session, cbt->ins->upd));
- if (btree->type == BTREE_ROW && cbt->ref->page->pg_row_upd != NULL)
- return (__wt_txn_update_check(
- session, cbt->ref->page->pg_row_upd[cbt->slot]));
+
+ if (btree->type == BTREE_ROW &&
+ cbt->ref->page->modify != NULL &&
+ cbt->ref->page->modify->mod_row_update != NULL)
+ return (__wt_txn_update_check(session,
+ cbt->ref->page->modify->mod_row_update[cbt->slot]));
return (0);
}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index 8ce1463a0db..bd5970ecf86 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -19,7 +19,7 @@ typedef struct {
* When using the standard event handlers, the debugging output has to
* do its own message handling because its output isn't line-oriented.
*/
- WT_FH *fh; /* Output file stream */
+ FILE *fp;
WT_ITEM *msg; /* Buffered message */
WT_ITEM *tmp; /* Temporary space */
@@ -36,17 +36,17 @@ static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *);
static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *);
static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *);
static void __debug_item(WT_DBG *, const char *, const void *, size_t);
-static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t);
-static void __debug_page_col_fix(WT_DBG *, WT_PAGE *);
+static int __debug_page(WT_DBG *, WT_REF *, uint32_t);
+static void __debug_page_col_fix(WT_DBG *, WT_REF *);
static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t);
-static int __debug_page_col_var(WT_DBG *, WT_PAGE *);
-static int __debug_page_metadata(WT_DBG *, WT_PAGE *);
+static int __debug_page_col_var(WT_DBG *, WT_REF *);
+static int __debug_page_metadata(WT_DBG *, WT_REF *);
static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t);
static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *);
static void __debug_ref(WT_DBG *, WT_REF *);
static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *);
static int __debug_tree(
- WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t);
+ WT_SESSION_IMPL *, WT_BTREE *, WT_REF *, const char *, uint32_t);
static void __debug_update(WT_DBG *, WT_UPDATE *, bool);
static void __dmsg(WT_DBG *, const char *, ...)
WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3)));
@@ -97,8 +97,11 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile)
if (ofile == NULL)
return (__wt_scr_alloc(session, 512, &ds->msg));
- return (__wt_open(session, ofile, WT_FILE_TYPE_REGULAR,
- WT_OPEN_CREATE | WT_STREAM_LINE_BUFFER | WT_STREAM_WRITE, &ds->fh));
+ if ((ds->fp = fopen(ofile, "w")) == NULL)
+ return (EIO);
+ __wt_stream_set_line_buffer(ds->fp);
+
+ return (0);
}
/*
@@ -127,7 +130,8 @@ __dmsg_wrapup(WT_DBG *ds)
}
/* Close any file we opened. */
- (void)__wt_close(session, &ds->fh);
+ if (ds->fp != NULL)
+ (void)fclose(ds->fp);
}
/*
@@ -152,7 +156,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...)
* the output chunk, and pass it to the event handler once we see a
* terminating newline.
*/
- if (ds->fh == NULL) {
+ if (ds->fp == NULL) {
msg = ds->msg;
for (;;) {
p = (char *)msg->mem + msg->size;
@@ -184,7 +188,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...)
}
} else {
va_start(ap, fmt);
- (void)__wt_vfprintf(session, ds->fh, fmt, ap);
+ (void)vfprintf(ds->fp, fmt, ap);
va_end(ap);
}
}
@@ -498,10 +502,10 @@ __wt_debug_tree_shape(
*/
int
__wt_debug_tree_all(
- WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile)
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile)
{
return (__debug_tree(session,
- btree, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
+ btree, ref, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK));
}
/*
@@ -513,9 +517,9 @@ __wt_debug_tree_all(
*/
int
__wt_debug_tree(
- WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile)
+ WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile)
{
- return (__debug_tree(session, btree, page, ofile, WT_DEBUG_TREE_WALK));
+ return (__debug_tree(session, btree, ref, ofile, WT_DEBUG_TREE_WALK));
}
/*
@@ -523,7 +527,7 @@ __wt_debug_tree(
* Dump the in-memory information for a page.
*/
int
-__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
+__wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile)
{
WT_DBG *ds, _ds;
WT_DECL_RET;
@@ -533,7 +537,7 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
ds = &_ds;
WT_RET(__debug_config(session, ds, ofile));
- ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF);
+ ret = __debug_page(ds, ref, WT_DEBUG_TREE_LEAF);
__dmsg_wrapup(ds);
@@ -549,9 +553,8 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile)
* in this function
*/
static int
-__debug_tree(
- WT_SESSION_IMPL *session, WT_BTREE *btree,
- WT_PAGE *page, const char *ofile, uint32_t flags)
+__debug_tree(WT_SESSION_IMPL *session,
+ WT_BTREE *btree, WT_REF *ref, const char *ofile, uint32_t flags)
{
WT_DBG *ds, _ds;
WT_DECL_RET;
@@ -560,10 +563,10 @@ __debug_tree(
WT_RET(__debug_config(session, ds, ofile));
/* A NULL page starts at the top of the tree -- it's a convenience. */
- if (page == NULL)
- page = btree->root.page;
+ if (ref == NULL)
+ ref = &btree->root;
- WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags));
+ WT_WITH_BTREE(session, btree, ret = __debug_page(ds, ref, flags));
__dmsg_wrapup(ds);
@@ -575,7 +578,7 @@ __debug_tree(
* Dump the in-memory information for an in-memory page.
*/
static int
-__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
+__debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
@@ -583,32 +586,32 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
session = ds->session;
/* Dump the page metadata. */
- WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, page));
+ WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, ref));
WT_RET(ret);
/* Dump the page. */
- switch (page->type) {
+ switch (ref->page->type) {
case WT_PAGE_COL_FIX:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
- __debug_page_col_fix(ds, page);
+ __debug_page_col_fix(ds, ref);
break;
case WT_PAGE_COL_INT:
WT_WITH_PAGE_INDEX(session,
- ret = __debug_page_col_int(ds, page, flags));
+ ret = __debug_page_col_int(ds, ref->page, flags));
WT_RET(ret);
break;
case WT_PAGE_COL_VAR:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
- WT_RET(__debug_page_col_var(ds, page));
+ WT_RET(__debug_page_col_var(ds, ref));
break;
case WT_PAGE_ROW_INT:
WT_WITH_PAGE_INDEX(session,
- ret = __debug_page_row_int(ds, page, flags));
+ ret = __debug_page_row_int(ds, ref->page, flags));
WT_RET(ret);
break;
case WT_PAGE_ROW_LEAF:
if (LF_ISSET(WT_DEBUG_TREE_LEAF))
- WT_RET(__debug_page_row_leaf(ds, page));
+ WT_RET(__debug_page_row_leaf(ds, ref->page));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -621,30 +624,32 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
* Dump an in-memory page's metadata.
*/
static int
-__debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
+__debug_page_metadata(WT_DBG *ds, WT_REF *ref)
{
+ WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_PAGE_MODIFY *mod;
WT_SESSION_IMPL *session;
uint32_t entries;
session = ds->session;
+ page = ref->page;
mod = page->modify;
__dmsg(ds, "%p", page);
switch (page->type) {
case WT_PAGE_COL_INT:
- __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno);
+ __dmsg(ds, " recno %" PRIu64, ref->ref_recno);
WT_INTL_INDEX_GET(session, page, pindex);
entries = pindex->entries;
break;
case WT_PAGE_COL_FIX:
- __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno);
+ __dmsg(ds, " recno %" PRIu64, ref->ref_recno);
entries = page->pg_fix_entries;
break;
case WT_PAGE_COL_VAR:
- __dmsg(ds, " recno %" PRIu64, page->pg_var_recno);
+ __dmsg(ds, " recno %" PRIu64, ref->ref_recno);
entries = page->pg_var_entries;
break;
case WT_PAGE_ROW_INT:
@@ -707,10 +712,11 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page)
* Dump an in-memory WT_PAGE_COL_FIX page.
*/
static void
-__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
+__debug_page_col_fix(WT_DBG *ds, WT_REF *ref)
{
WT_BTREE *btree;
WT_INSERT *ins;
+ WT_PAGE *page;
const WT_PAGE_HEADER *dsk;
WT_SESSION_IMPL *session;
uint64_t recno;
@@ -721,8 +727,9 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page)
session = ds->session;
btree = S2BT(session);
+ page = ref->page;
dsk = page->dsk;
- recno = page->pg_fix_recno;
+ recno = ref->ref_recno;
if (dsk != NULL) {
ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page));
@@ -767,7 +774,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
session = ds->session;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
- __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno);
+ __dmsg(ds, "\trecno %" PRIu64 "\n", ref->ref_recno);
__debug_ref(ds, ref);
} WT_INTL_FOREACH_END;
@@ -775,7 +782,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
if (ref->state == WT_REF_MEM) {
__dmsg(ds, "\n");
- WT_RET(__debug_page(ds, ref->page, flags));
+ WT_RET(__debug_page(ds, ref, flags));
}
} WT_INTL_FOREACH_END;
@@ -787,18 +794,20 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
* Dump an in-memory WT_PAGE_COL_VAR page.
*/
static int
-__debug_page_col_var(WT_DBG *ds, WT_PAGE *page)
+__debug_page_col_var(WT_DBG *ds, WT_REF *ref)
{
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT_HEAD *update;
+ WT_PAGE *page;
uint64_t recno, rle;
uint32_t i;
char tag[64];
unpack = &_unpack;
- recno = page->pg_var_recno;
+ page = ref->page;
+ recno = ref->ref_recno;
WT_COL_FOREACH(page, cip, i) {
if ((cell = WT_COL_PTR(page, cip)) == NULL) {
@@ -849,7 +858,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
if (ref->state == WT_REF_MEM) {
__dmsg(ds, "\n");
- WT_RET(__debug_page(ds, ref->page, flags));
+ WT_RET(__debug_page(ds, ref, flags));
}
} WT_INTL_FOREACH_END;
return (0);
@@ -952,8 +961,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
__dmsg(ds, "\tvalue {deleted}\n");
else if (hexbyte) {
__dmsg(ds, "\t{");
- __debug_hex_byte(ds,
- ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ __debug_hex_byte(ds, *(uint8_t *)WT_UPDATE_DATA(upd));
__dmsg(ds, "}\n");
} else
__debug_item(ds,
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index ba16dd204e8..54b7fedb31d 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -288,10 +288,9 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
* read-only or if the application never modifies the tree, we're not
* able to do so.)
*/
- if (btree->modified) {
- WT_RET(__wt_page_modify_init(session, page));
+ WT_RET(__wt_page_modify_init(session, page));
+ if (btree->modified)
__wt_page_modify_set(session, page);
- }
/*
* An operation is accessing a "deleted" page, and we're building an
@@ -326,7 +325,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
/* Allocate the per-page update array. */
WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
- page->pg_row_upd = upd_array;
+ page->modify->mod_row_update = upd_array;
/*
* Fill in the per-reference update array with references to update
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 1181d92609f..9807d5bc88f 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -40,7 +40,6 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
- WT_FH *fh;
WT_PAGE *page;
WT_PAGE_HEADER *dsk;
WT_PAGE_MODIFY *mod;
@@ -134,10 +133,11 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
dsk = (WT_PAGE_HEADER *)page->dsk;
if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
__wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
- if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) {
- fh = S2BT(session)->bm->block->fh;
- (void)fh->fh_map_discard(session, fh, dsk, dsk->mem_size);
- }
+
+ /* Discard any mapped image. */
+ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
+ (void)S2BT(session)->bm->map_discard(
+ S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size);
__wt_overwrite_and_free(session, page);
}
@@ -194,16 +194,33 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__free_skip_list(
session, WT_SKIP_FIRST(append), update_ignore);
__wt_free(session, append);
- __wt_free(session, mod->mod_append);
+ __wt_free(session, mod->mod_col_append);
}
/* Free the insert/update array. */
- if (mod->mod_update != NULL)
- __free_skip_array(session, mod->mod_update,
+ if (mod->mod_col_update != NULL)
+ __free_skip_array(session, mod->mod_col_update,
page->type ==
WT_PAGE_COL_FIX ? 1 : page->pg_var_entries,
update_ignore);
break;
+ case WT_PAGE_ROW_LEAF:
+ /*
+ * Free the insert array.
+ *
+ * Row-store tables have one additional slot in the insert array
+ * (the insert array has an extra slot to hold keys that sort
+ * before keys found on the original page).
+ */
+ if (mod->mod_row_insert != NULL)
+ __free_skip_array(session, mod->mod_row_insert,
+ page->pg_row_entries + 1, update_ignore);
+
+ /* Free the update array. */
+ if (mod->mod_row_update != NULL)
+ __free_update(session, mod->mod_row_update,
+ page->pg_row_entries, update_ignore);
+ break;
}
/* Free the overflow on-page, reuse and transaction-cache skiplists. */
@@ -324,10 +341,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ROW *rip;
uint32_t i;
void *copy;
- bool update_ignore;
-
- /* In some failed-split cases, we can't discard updates. */
- update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);
/*
* Free the in-memory index array.
@@ -342,22 +355,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
page, copy, &ikey, NULL, NULL, NULL);
__wt_free(session, ikey);
}
-
- /*
- * Free the insert array.
- *
- * Row-store tables have one additional slot in the insert array (the
- * insert array has an extra slot to hold keys that sort before keys
- * found on the original page).
- */
- if (page->pg_row_ins != NULL)
- __free_skip_array(session,
- page->pg_row_ins, page->pg_row_entries + 1, update_ignore);
-
- /* Free the update array. */
- if (page->pg_row_upd != NULL)
- __free_update(session,
- page->pg_row_upd, page->pg_row_entries, update_ignore);
}
/*
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 02eea9c2f0c..ba545859d07 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -371,7 +371,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno)
root_ref->page = root;
root_ref->state = WT_REF_MEM;
- root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB;
+ root_ref->ref_recno = is_recno ? 1 : WT_RECNO_OOB;
root->pg_intl_parent_ref = root_ref;
}
@@ -495,7 +495,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
case BTREE_COL_FIX:
case BTREE_COL_VAR:
WT_ERR(__wt_page_alloc(
- session, WT_PAGE_COL_INT, 1, 1, true, &root));
+ session, WT_PAGE_COL_INT, 1, true, &root));
root->pg_intl_parent_ref = &btree->root;
pindex = WT_INTL_INDEX_GET_SAFE(root);
@@ -504,11 +504,11 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
ref->page = NULL;
ref->addr = NULL;
ref->state = WT_REF_DELETED;
- ref->key.recno = 1;
+ ref->ref_recno = 1;
break;
case BTREE_ROW:
WT_ERR(__wt_page_alloc(
- session, WT_PAGE_ROW_INT, 0, 1, true, &root));
+ session, WT_PAGE_ROW_INT, 1, true, &root));
root->pg_intl_parent_ref = &btree->root;
pindex = WT_INTL_INDEX_GET_SAFE(root);
@@ -524,7 +524,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation)
/* Bulk loads require a leaf page for reconciliation: create it now. */
if (F_ISSET(btree, WT_BTREE_BULK)) {
- WT_ERR(__wt_btree_new_leaf_page(session, 1, &leaf));
+ WT_ERR(__wt_btree_new_leaf_page(session, &leaf));
ref->page = leaf;
ref->state = WT_REF_MEM;
WT_ERR(__wt_page_modify_init(session, leaf));
@@ -548,8 +548,7 @@ err: if (leaf != NULL)
* Create an empty leaf page.
*/
int
-__wt_btree_new_leaf_page(
- WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep)
+__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
WT_BTREE *btree;
@@ -558,15 +557,15 @@ __wt_btree_new_leaf_page(
switch (btree->type) {
case BTREE_COL_FIX:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_COL_FIX, recno, 0, false, pagep));
+ session, WT_PAGE_COL_FIX, 0, false, pagep));
break;
case BTREE_COL_VAR:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_COL_VAR, recno, 0, false, pagep));
+ session, WT_PAGE_COL_VAR, 0, false, pagep));
break;
case BTREE_ROW:
WT_RET(__wt_page_alloc(
- session, WT_PAGE_ROW_LEAF, WT_RECNO_OOB, 0, false, pagep));
+ session, WT_PAGE_ROW_LEAF, 0, false, pagep));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -639,7 +638,7 @@ __btree_get_last_recno(WT_SESSION_IMPL *session)
page = next_walk->page;
btree->last_recno = page->type == WT_PAGE_COL_VAR ?
- __col_var_last_recno(page) : __col_fix_last_recno(page);
+ __col_var_last_recno(next_walk) : __col_fix_last_recno(next_walk);
return (__wt_page_release(session, next_walk, 0));
}
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index a1aaf2c7ea0..9e9d69c342e 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -133,10 +133,10 @@ static int __wt_huffman_read(WT_SESSION_IMPL *,
* Check for a Huffman configuration file and return the file name.
*/
static int
-__huffman_confchk_file(
- WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v, bool *is_utf8p, WT_FH **fhp)
+__huffman_confchk_file(WT_SESSION_IMPL *session,
+ WT_CONFIG_ITEM *v, bool *is_utf8p, WT_FSTREAM **fsp)
{
- WT_FH *fh;
+ WT_FSTREAM *fs;
WT_DECL_RET;
size_t len;
char *fname;
@@ -157,14 +157,13 @@ __huffman_confchk_file(
/* Check the file exists. */
WT_RET(__wt_strndup(session, v->str + len, v->len - len, &fname));
- WT_ERR(__wt_open(session, fname, WT_FILE_TYPE_REGULAR,
- WT_OPEN_FIXED | WT_OPEN_READONLY | WT_STREAM_READ, &fh));
+ WT_ERR(__wt_fopen(session, fname, WT_OPEN_FIXED, WT_STREAM_READ, &fs));
/* Optionally return the file handle. */
- if (fhp == NULL)
- (void)__wt_close(session, &fh);
+ if (fsp == NULL)
+ (void)__wt_fclose(session, &fs);
else
- *fhp = fh;
+ *fsp = fs;
err: __wt_free(session, fname);
@@ -300,7 +299,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
struct __wt_huffman_table *table, *tp;
WT_DECL_ITEM(tmp);
WT_DECL_RET;
- WT_FH *fh;
+ WT_FSTREAM *fs;
int64_t symbol, frequency;
u_int entries, lineno;
int n;
@@ -309,13 +308,13 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
*tablep = NULL;
*entriesp = *numbytesp = 0;
- fh = NULL;
+ fs = NULL;
table = NULL;
/*
* Try and open the backing file.
*/
- WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fh));
+ WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fs));
/*
* UTF-8 table is 256 bytes, with a range of 0-255.
@@ -333,7 +332,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
WT_ERR(__wt_scr_alloc(session, 0, &tmp));
for (tp = table, lineno = 1;; ++tp, ++lineno) {
- WT_ERR(__wt_getline(session, tmp, fh));
+ WT_ERR(__wt_getline(session, fs, tmp));
if (tmp->size == 0)
break;
n = sscanf(
@@ -378,7 +377,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
if (0) {
err: __wt_free(session, table);
}
- (void)__wt_close(session, &fh);
+ (void)__wt_fclose(session, &fs);
__wt_scr_free(session, &tmp);
return (ret);
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index 9fa0145bbdd..00ec8aa4494 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -10,7 +10,7 @@
static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *);
static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *);
-static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
+static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, size_t *);
static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *);
static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *);
static int __inmem_row_leaf_entries(
@@ -21,8 +21,8 @@ static int __inmem_row_leaf_entries(
* Create or read a page into the cache.
*/
int
-__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
- uint64_t recno, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep)
+__wt_page_alloc(WT_SESSION_IMPL *session,
+ uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep)
{
WT_CACHE *cache;
WT_DECL_RET;
@@ -67,13 +67,10 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
switch (type) {
case WT_PAGE_COL_FIX:
- page->pg_fix_recno = recno;
page->pg_fix_entries = alloc_entries;
break;
case WT_PAGE_COL_INT:
case WT_PAGE_ROW_INT:
- page->pg_intl_recno = recno;
-
/*
* Internal pages have an array of references to objects so they
* can split. Allocate the array of references and optionally,
@@ -105,7 +102,6 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) {
}
break;
case WT_PAGE_COL_VAR:
- page->pg_var_recno = recno;
page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
page->pg_var_entries = alloc_entries;
break;
@@ -191,8 +187,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
}
/* Allocate and initialize a new WT_PAGE. */
- WT_RET(__wt_page_alloc(
- session, dsk->type, dsk->recno, alloc_entries, true, &page));
+ WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, true, &page));
page->dsk = dsk;
F_SET_ATOMIC(page, flags);
@@ -211,7 +206,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref,
__inmem_col_int(session, page);
break;
case WT_PAGE_COL_VAR:
- WT_ERR(__inmem_col_var(session, page, &size));
+ WT_ERR(__inmem_col_var(session, page, dsk->recno, &size));
break;
case WT_PAGE_ROW_INT:
WT_ERR(__inmem_row_int(session, page, &size));
@@ -292,7 +287,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_cell_unpack(cell, unpack);
ref->addr = cell;
- ref->key.recno = unpack->v;
+ ref->ref_recno = unpack->v;
}
}
@@ -329,7 +324,8 @@ __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np)
* column-store trees.
*/
static int
-__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
+__inmem_col_var(
+ WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t *sizep)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -337,13 +333,12 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep)
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
const WT_PAGE_HEADER *dsk;
- uint64_t recno, rle;
+ uint64_t rle;
size_t bytes_allocated;
uint32_t i, indx, n, repeat_off;
btree = S2BT(session);
dsk = page->dsk;
- recno = page->pg_var_recno;
repeats = NULL;
repeat_off = 0;
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 5cf6a9bf2bc..89d16a3f827 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -326,7 +326,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_page_evict_soon(page);
/* Bump the oldest ID, we're about to do some visibility checks. */
- __wt_txn_update_oldest(session, false);
+ WT_RET(__wt_txn_update_oldest(session, false));
/* If eviction cannot succeed, don't try. */
return (__wt_page_can_evict(session, ref, NULL));
@@ -377,9 +377,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref)
if (addr == NULL) {
WT_ASSERT(session, previous_state == WT_REF_DELETED);
- WT_ERR(__wt_btree_new_leaf_page(session,
- btree->type == BTREE_ROW ? WT_RECNO_OOB : ref->key.recno,
- &page));
+ WT_ERR(__wt_btree_new_leaf_page(session, &page));
ref->page = page;
goto done;
}
@@ -463,6 +461,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
btree = S2BT(session);
+ WT_STAT_FAST_CONN_INCR(session, cache_pages_requested);
+ WT_STAT_FAST_DATA_INCR(session, cache_pages_requested);
for (evict_soon = stalled = false,
force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
switch (ref->state) {
diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c
index d94eb2ddd80..de54e8433a8 100644
--- a/src/btree/bt_rebalance.c
+++ b/src/btree/bt_rebalance.c
@@ -90,7 +90,7 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session,
if (recno == WT_RECNO_OOB)
WT_RET(__wt_row_ikey(session, 0, key, key_len, copy));
else
- copy->key.recno = recno;
+ copy->ref_recno = recno;
copy->page_del = NULL;
return (0);
@@ -147,8 +147,7 @@ __rebalance_internal(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs)
leaf_next = (uint32_t)rs->leaf_next;
/* Allocate a row-store root (internal) page and fill it in. */
- WT_RET(__wt_page_alloc(session, rs->type,
- rs->type == WT_PAGE_COL_INT ? 1 : 0, leaf_next, false, &page));
+ WT_RET(__wt_page_alloc(session, rs->type, leaf_next, false, &page));
page->pg_intl_parent_ref = &btree->root;
WT_ERR(__wt_page_modify_init(session, page));
__wt_page_modify_set(session, page);
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index ebc0499f6a2..8ef2db67e7b 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -46,7 +46,7 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
}
/* Take the value from the original page. */
- v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
+ v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt);
return (__wt_buf_set(session, &cursor->value, &v, 1));
case WT_PAGE_COL_VAR:
/*
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 0e064d306b6..9b5e4daf74a 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -116,8 +116,8 @@ struct __wt_track {
static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *);
-static int __slvg_col_ovfl(
- WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t);
+static int __slvg_col_ovfl(WT_SESSION_IMPL *,
+ WT_TRACK *, WT_PAGE *, uint64_t, uint64_t, uint64_t);
static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_col_range_overlap(
@@ -166,11 +166,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
WT_DECL_RET;
WT_STUFF *ss, stuff;
uint32_t i, leaf_cnt;
+ bool evict_reset;
WT_UNUSED(cfg);
btree = S2BT(session);
bm = btree->bm;
+ evict_reset = false;
WT_CLEAR(stuff);
ss = &stuff;
@@ -182,6 +184,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2));
/*
+ * Salvage handles its own page eviction; get exclusive access to the
+ * file, have eviction ignore the tree entirely.
+ */
+ WT_ERR(__wt_evict_file_exclusive_on(session));
+ evict_reset = true;
+
+ /*
* Step 1:
* Inform the underlying block manager that we're salvaging the file.
*/
@@ -295,13 +304,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
case WT_PAGE_COL_VAR:
WT_WITH_PAGE_INDEX(session,
ret = __slvg_col_build_internal(
- session, leaf_cnt, ss));
+ session, leaf_cnt, ss));
WT_ERR(ret);
break;
case WT_PAGE_ROW_LEAF:
WT_WITH_PAGE_INDEX(session,
ret = __slvg_row_build_internal(
- session, leaf_cnt, ss));
+ session, leaf_cnt, ss));
WT_ERR(ret);
break;
}
@@ -341,6 +350,9 @@ err: WT_TRET(bm->salvage_end(bm, session));
if (ss->root_ref.page != NULL)
__wt_ref_out(session, &ss->root_ref);
+ if (evict_reset)
+ __wt_evict_file_exclusive_off(session);
+
/* Discard the leaf and overflow page memory. */
WT_TRET(__slvg_cleanup(session, ss));
@@ -1159,7 +1171,7 @@ __slvg_col_build_internal(
/* Allocate a column-store root (internal) page and fill it in. */
WT_RET(__wt_page_alloc(
- session, WT_PAGE_COL_INT, 1, leaf_cnt, true, &page));
+ session, WT_PAGE_COL_INT, leaf_cnt, true, &page));
WT_ERR(__slvg_modify_init(session, page));
pindex = WT_INTL_INDEX_GET_SAFE(page);
@@ -1180,7 +1192,7 @@ __slvg_col_build_internal(
ref->addr = addr;
addr = NULL;
- ref->key.recno = trk->col_start;
+ ref->ref_recno = trk->col_start;
ref->state = WT_REF_DISK;
/*
@@ -1223,7 +1235,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
WT_DECL_RET;
WT_PAGE *page;
WT_SALVAGE_COOKIE *cookie, _cookie;
- uint64_t skip, take;
+ uint64_t recno, skip, take;
uint32_t *entriesp, save_entries;
cookie = &_cookie;
@@ -1243,7 +1255,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
* Calculate the number of K/V entries we are going to skip, and
* the total number of K/V entries we'll take from this page.
*/
- cookie->skip = skip = trk->col_start - page->pg_var_recno;
+ recno = page->dsk->recno;
+ cookie->skip = skip = trk->col_start - recno;
cookie->take = take = (trk->col_stop - trk->col_start) + 1;
WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
@@ -1255,7 +1268,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
/* Set the referenced flag on overflow pages we're using. */
if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0)
- WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take));
+ WT_ERR(__slvg_col_ovfl(session, trk, page, recno, skip, take));
/*
* If we're missing some part of the range, the real start range is in
@@ -1263,9 +1276,9 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
* reference as well as the page itself.
*/
if (trk->col_missing == 0)
- page->pg_var_recno = trk->col_start;
+ ref->ref_recno = trk->col_start;
else {
- page->pg_var_recno = trk->col_missing;
+ ref->ref_recno = trk->col_missing;
cookie->missing = trk->col_start - trk->col_missing;
WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE,
@@ -1274,7 +1287,6 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref)
session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
cookie->missing));
}
- ref->key.recno = page->pg_var_recno;
/*
* We can't discard the original blocks associated with this page now.
@@ -1338,21 +1350,20 @@ __slvg_col_ovfl_single(
* Mark overflow items referenced by the merged page.
*/
static int
-__slvg_col_ovfl(WT_SESSION_IMPL *session,
- WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take)
+__slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk,
+ WT_PAGE *page, uint64_t recno, uint64_t skip, uint64_t take)
{
WT_CELL_UNPACK unpack;
WT_CELL *cell;
WT_COL *cip;
WT_DECL_RET;
- uint64_t recno, start, stop;
+ uint64_t start, stop;
uint32_t i;
/*
* Merging a variable-length column-store page, and we took some number
* of records, figure out which (if any) overflow records we used.
*/
- recno = page->pg_var_recno;
start = recno + skip;
stop = (recno + skip + take) - 1;
@@ -1816,7 +1827,7 @@ __slvg_row_build_internal(
/* Allocate a row-store root (internal) page and fill it in. */
WT_RET(__wt_page_alloc(
- session, WT_PAGE_ROW_INT, WT_RECNO_OOB, leaf_cnt, true, &page));
+ session, WT_PAGE_ROW_INT, leaf_cnt, true, &page));
WT_ERR(__slvg_modify_init(session, page));
pindex = WT_INTL_INDEX_GET_SAFE(page);
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 4f16a290958..2d7b0a0030f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -207,8 +207,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->home == page);
- WT_ASSERT(session, ref->key.recno > recno);
- recno = ref->key.recno;
+ WT_ASSERT(session, ref->ref_recno > recno);
+ recno = ref->ref_recno;
} WT_INTL_FOREACH_END;
break;
case WT_PAGE_ROW_INT:
@@ -335,7 +335,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) {
__wt_ref_key(from_home, ref, &key, &size);
WT_RET(__wt_row_ikey(session, 0, key, size, ref));
- ikey = ref->key.ikey;
+ ikey = ref->ref_ikey;
} else {
WT_RET(
__split_ovfl_key_cleanup(session, from_home, ref));
@@ -529,7 +529,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_REF **child_refp, *ref, **root_refp;
WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
- uint64_t recno, split_gen;
+ uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
@@ -593,10 +593,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
- recno = root->type == WT_PAGE_COL_INT ?
- (*root_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, root->type, recno, slots, false, &child));
+ session, root->type, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -611,7 +609,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = recno;
+ ref->ref_recno = (*root_refp)->ref_recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
@@ -737,7 +735,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
alloc_index = pindex = NULL;
parent_decr = 0;
- parent_entries = 0;
empty_parent = false;
complete = WT_ERR_RETURN;
@@ -1014,7 +1011,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_REF **child_refp, *page_ref, **page_refp, *ref;
WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
- uint64_t recno, split_gen;
+ uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
@@ -1099,10 +1096,8 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
- recno = page->type == WT_PAGE_COL_INT ?
- (*page_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, page->type, recno, slots, false, &child));
+ session, page->type, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -1117,7 +1112,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
parent_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = recno;
+ ref->ref_recno = (*page_refp)->ref_recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
@@ -1483,6 +1478,15 @@ __split_multi_inmem(
uint32_t i, slot;
/*
+ * In 04/2016, we removed column-store record numbers from the WT_PAGE
+ * structure, leading to hard-to-debug problems because we corrupt the
+ * page if we search it using the wrong initial record number. For now,
+ * assert the record number is set.
+ */
+ WT_ASSERT(session,
+ orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0);
+
+ /*
* This code re-creates an in-memory page that is part of a set created
* while evicting a large page, and adds references to any unresolved
* update chains to the new page. We get here due to choosing to keep
@@ -1525,7 +1529,7 @@ __split_multi_inmem(
/* Build a key. */
if (supd->ins == NULL) {
slot = WT_ROW_SLOT(orig, supd->rip);
- upd = orig->pg_row_upd[slot];
+ upd = orig->modify->mod_row_update[slot];
WT_ERR(__wt_row_leaf_key(
session, orig, supd->rip, key, false));
@@ -1588,7 +1592,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
case WT_PAGE_ROW_LEAF:
if (supd->ins == NULL) {
slot = WT_ROW_SLOT(orig, supd->rip);
- orig->pg_row_upd[slot] = NULL;
+ orig->modify->mod_row_update[slot] = NULL;
} else
supd->ins->upd = NULL;
break;
@@ -1605,11 +1609,16 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref)
/*
* We failed creating new in-memory pages. For error-handling reasons,
* we've left the update chains referenced by both the original and
- * new pages. Discard the new pages, setting a flag so the discard code
- * doesn't discard the updates on the page.
+ * new pages. Discard the new allocated WT_REF structures and their
+ * pages (setting a flag so the discard code doesn't discard the updates
+ * on the page).
+ *
+ * Our callers allocate WT_REF arrays, then individual WT_REFs, check
+ * for uninitialized information.
*/
- if (ref->page != NULL) {
- F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE);
+ if (ref != NULL) {
+ if (ref->page != NULL)
+ F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE);
__wt_free_ref(session, ref, orig->type, true);
}
}
@@ -1627,7 +1636,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
WT_REF *ref;
size_t incr;
- addr = NULL;
incr = 0;
/* Allocate an underlying WT_REF. */
@@ -1635,9 +1643,24 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
ref = *refp;
incr += sizeof(WT_REF);
- /* Any parent reference is filled in by our caller. */
- ref->home = NULL;
+ /*
+ * Set the WT_REF key before (optionally) building the page, underlying
+ * column-store functions need the page's key space to search it.
+ */
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ ikey = multi->key.ikey;
+ WT_RET(__wt_row_ikey(
+ session, 0, WT_IKEY_DATA(ikey), ikey->size, ref));
+ incr += sizeof(WT_IKEY) + ikey->size;
+ break;
+ default:
+ ref->ref_recno = multi->key.recno;
+ break;
+ }
+ /* If there's a disk image, build a page, otherwise set the address. */
if (multi->disk_image == NULL) {
/*
* Copy the address: we could simply take the buffer, but that
@@ -1651,28 +1674,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session,
addr->type = multi->addr.type;
WT_RET(__wt_strndup(session,
multi->addr.addr, addr->size, &addr->addr));
- } else
+ ref->state = WT_REF_DISK;
+ } else {
WT_RET(__split_multi_inmem(session, page, ref, multi));
-
- switch (page->type) {
- case WT_PAGE_ROW_INT:
- case WT_PAGE_ROW_LEAF:
- ikey = multi->key.ikey;
- WT_RET(__wt_row_ikey(
- session, 0, WT_IKEY_DATA(ikey), ikey->size, ref));
- incr += sizeof(WT_IKEY) + ikey->size;
- break;
- default:
- ref->key.recno = multi->key.recno;
- break;
+ ref->state = WT_REF_MEM;
}
- ref->state = addr != NULL ? WT_REF_DISK : WT_REF_MEM;
-
- /*
- * If our caller wants to track the memory allocations, we have a return
- * reference.
- */
+ /* Optionally return changes in the memory footprint. */
if (incrp != NULL)
*incrp += incr;
return (0);
@@ -1773,17 +1781,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
parent_incr += sizeof(WT_IKEY) + key->size;
__wt_scr_free(session, &key);
} else
- child->key.recno = ref->key.recno;
+ child->ref_recno = ref->ref_recno;
/*
* The second page in the split is a new WT_REF/page pair.
*/
- if (type == WT_PAGE_ROW_LEAF)
- WT_ERR(__wt_page_alloc(session,
- type, WT_RECNO_OOB, 0, false, &right));
- else
- WT_ERR(__wt_page_alloc(session,
- type, WT_INSERT_RECNO(moved_ins), 0, false, &right));
+ WT_ERR(__wt_page_alloc(session, type, 0, false, &right));
/*
* The new page is dirty by definition, plus column-store splits update
@@ -1793,11 +1796,15 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_page_modify_set(session, right);
if (type == WT_PAGE_ROW_LEAF) {
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
+ WT_ERR(__wt_calloc_one(
+ session, &right->modify->mod_row_insert));
+ WT_ERR(__wt_calloc_one(
+ session, &right->modify->mod_row_insert[0]));
} else {
- WT_ERR(__wt_calloc_one(session, &right->modify->mod_append));
- WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0]));
+ WT_ERR(__wt_calloc_one(
+ session, &right->modify->mod_col_append));
+ WT_ERR(__wt_calloc_one(
+ session, &right->modify->mod_col_append[0]));
}
right_incr += sizeof(WT_INSERT_HEAD);
right_incr += sizeof(WT_INSERT_HEAD *);
@@ -1814,7 +1821,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
child));
parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
} else
- child->key.recno = WT_INSERT_RECNO(moved_ins);
+ child->ref_recno = WT_INSERT_RECNO(moved_ins);
/*
* Allocation operations completed, we're going to split.
@@ -1823,8 +1830,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
if (type != WT_PAGE_ROW_LEAF) {
WT_ASSERT(session,
- page->modify->mod_split_recno == WT_RECNO_OOB);
- page->modify->mod_split_recno = child->key.recno;
+ page->modify->mod_col_split_recno == WT_RECNO_OOB);
+ page->modify->mod_col_split_recno = child->ref_recno;
}
/*
@@ -1848,7 +1855,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* can be ignored.)
*/
tmp_ins_head = type == WT_PAGE_ROW_LEAF ?
- right->pg_row_ins[0] : right->modify->mod_append[0];
+ right->modify->mod_row_insert[0] : right->modify->mod_col_append[0];
tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins;
/*
@@ -1970,7 +1977,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* Reset the split column-store page record.
*/
if (type != WT_PAGE_ROW_LEAF)
- page->modify->mod_split_recno = WT_RECNO_OOB;
+ page->modify->mod_col_split_recno = WT_RECNO_OOB;
/*
* Clear the allocated page's reference to the moved insert list element
@@ -1983,11 +1990,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* lists have.
*/
if (type == WT_PAGE_ROW_LEAF)
- right->pg_row_ins[0]->head[0] =
- right->pg_row_ins[0]->tail[0] = NULL;
+ right->modify->mod_row_insert[0]->head[0] =
+ right->modify->mod_row_insert[0]->tail[0] = NULL;
else
- right->modify->mod_append[0]->head[0] =
- right->modify->mod_append[0]->tail[0] = NULL;
+ right->modify->mod_col_append[0]->head[0] =
+ right->modify->mod_col_append[0]->tail[0] = NULL;
ins_head->tail[0]->next[0] = moved_ins;
ins_head->tail[0] = moved_ins;
@@ -1999,12 +2006,12 @@ err: if (split_ref[0] != NULL) {
ref->addr = split_ref[0]->addr;
if (type == WT_PAGE_ROW_LEAF)
- __wt_free(session, split_ref[0]->key.ikey);
+ __wt_free(session, split_ref[0]->ref_ikey);
__wt_free(session, split_ref[0]);
}
if (split_ref[1] != NULL) {
if (type == WT_PAGE_ROW_LEAF)
- __wt_free(session, split_ref[1]->key.ikey);
+ __wt_free(session, split_ref[1]->ref_ikey);
__wt_free(session, split_ref[1]);
}
if (right != NULL) {
@@ -2170,7 +2177,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
- WT_REF new;
+ WT_REF *new;
page = ref->page;
mod = page->modify;
@@ -2187,9 +2194,15 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
* exactly what we want to do.
*
* Build the new page.
+ *
+ * Allocate a WT_REF because the error path uses routines that will
+ * free memory. The only field we need to set is the record number, as
+ * it's used by the search routines.
*/
- memset(&new, 0, sizeof(new));
- WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0]));
+ WT_RET(__wt_calloc_one(session, &new));
+ new->ref_recno = ref->ref_recno;
+
+ WT_ERR(__split_multi_inmem(session, page, new, &mod->mod_multi[0]));
/*
* The rewrite succeeded, we can no longer fail.
@@ -2209,11 +2222,12 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_ref_out(session, ref);
/* Swap the new page into place. */
- ref->page = new.page;
+ ref->page = new->page;
WT_PUBLISH(ref->state, WT_REF_MEM);
+ __wt_free(session, new);
return (0);
-err: __split_multi_inmem_fail(session, page, &new);
+err: __split_multi_inmem_fail(session, page, new);
return (ret);
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index 826589f8bdd..5d60c436a08 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -81,7 +81,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
if (__wt_page_is_modified(page) &&
WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
if (txn->isolation == WT_ISO_READ_COMMITTED)
- __wt_txn_get_snapshot(session);
+ WT_ERR(__wt_txn_get_snapshot(session));
leaf_bytes += page->memory_footprint;
++leaf_pages;
WT_ERR(__wt_reconcile(session, walk, NULL, 0));
@@ -100,7 +100,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* the metadata shouldn't be that big, and (b) if we do ever
*/
if (txn->isolation == WT_ISO_READ_COMMITTED)
- __wt_txn_get_snapshot(session);
+ WT_ERR(__wt_txn_get_snapshot(session));
/*
* We cannot check the tree modified flag in the case of a
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 83dc7924312..531a0dc125a 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -355,7 +355,7 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
if (vs->dump_blocks)
WT_RET(__wt_debug_disk(session, page->dsk, NULL));
if (vs->dump_pages)
- WT_RET(__wt_debug_page(session, page, NULL));
+ WT_RET(__wt_debug_page(session, ref, NULL));
#endif
/*
@@ -364,13 +364,11 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
*/
switch (page->type) {
case WT_PAGE_COL_FIX:
- recno = page->pg_fix_recno;
- goto recno_chk;
case WT_PAGE_COL_INT:
- recno = page->pg_intl_recno;
+ recno = ref->ref_recno;
goto recno_chk;
case WT_PAGE_COL_VAR:
- recno = page->pg_var_recno;
+ recno = ref->ref_recno;
recno_chk: if (recno != vs->record_total + 1)
WT_RET_MSG(session, WT_ERROR,
"page at %s has a starting record of %" PRIu64
@@ -485,7 +483,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR,
* reviewed to this point.
*/
++entry;
- if (child_ref->key.recno != vs->record_total + 1) {
+ if (child_ref->ref_recno != vs->record_total + 1) {
WT_RET_MSG(session, WT_ERROR,
"the starting record number in entry %"
PRIu32 " of the column internal page at "
@@ -494,7 +492,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR,
entry,
__wt_page_addr_string(
session, child_ref, vs->tmp1),
- child_ref->key.recno,
+ child_ref->ref_recno,
vs->record_total + 1);
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index fd60b12538a..a7920da5267 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -55,7 +55,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
*/
if (recno == WT_RECNO_OOB ||
recno > (btree->type == BTREE_COL_VAR ?
- __col_var_last_recno(page) : __col_fix_last_recno(page)))
+ __col_var_last_recno(cbt->ref) :
+ __col_fix_last_recno(cbt->ref)))
append = true;
}
@@ -107,17 +108,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Allocate the append/update list reference as necessary. */
if (append) {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, mod->mod_append, ins_headp, 1);
- ins_headp = &mod->mod_append[0];
+ page, mod->mod_col_append, ins_headp, 1);
+ ins_headp = &mod->mod_col_append[0];
} else if (page->type == WT_PAGE_COL_FIX) {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, mod->mod_update, ins_headp, 1);
- ins_headp = &mod->mod_update[0];
+ page, mod->mod_col_update, ins_headp, 1);
+ ins_headp = &mod->mod_col_update[0];
} else {
WT_PAGE_ALLOC_AND_SWAP(session,
- page, mod->mod_update, ins_headp,
+ page, mod->mod_col_update, ins_headp,
page->pg_var_entries);
- ins_headp = &mod->mod_update[cbt->slot];
+ ins_headp = &mod->mod_col_update[cbt->slot];
}
/* Allocate the WT_INSERT_HEAD structure as necessary. */
@@ -142,8 +143,9 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* it's easy (as opposed to in row-store) and a difficult bug to
* otherwise diagnose.
*/
- WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB ||
- (recno != WT_RECNO_OOB && mod->mod_split_recno > recno));
+ WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB ||
+ (recno != WT_RECNO_OOB &&
+ mod->mod_col_split_recno > recno));
if (upd_arg == NULL) {
WT_ERR(
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index 4730267a545..6c96181d3bf 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -30,7 +30,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session,
* Check if the search key is smaller than the parent's starting key for
* this page.
*/
- if (recno < leaf->key.recno) {
+ if (recno < leaf->ref_recno) {
cbt->compare = 1; /* page keys > search key */
return (0);
}
@@ -48,7 +48,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session,
WT_INTL_INDEX_GET(session, leaf->home, pindex);
indx = leaf->pindex_hint;
if (indx + 1 < pindex->entries && pindex->index[indx] == leaf)
- if (recno >= pindex->index[indx + 1]->key.recno) {
+ if (recno >= pindex->index[indx + 1]->ref_recno) {
cbt->compare = -1; /* page keys < search key */
return (0);
}
@@ -133,14 +133,12 @@ restart: /*
if (page->type != WT_PAGE_COL_INT)
break;
- WT_ASSERT(session, current->key.recno == page->pg_intl_recno);
-
WT_INTL_INDEX_GET(session, page, pindex);
base = pindex->entries;
descent = pindex->index[base - 1];
/* Fast path appends. */
- if (recno >= descent->key.recno) {
+ if (recno >= descent->ref_recno) {
/*
* If on the last slot (the key is larger than any key
* on the page), check for an internal page split race.
@@ -158,9 +156,9 @@ restart: /*
indx = base + (limit >> 1);
descent = pindex->index[indx];
- if (recno == descent->key.recno)
+ if (recno == descent->ref_recno)
break;
- if (recno < descent->key.recno)
+ if (recno < descent->ref_recno)
continue;
base = indx + 1;
--limit;
@@ -172,7 +170,7 @@ descend: /*
* (last + 1) index. The slot for descent is the one before
* base.
*/
- if (recno != descent->key.recno) {
+ if (recno != descent->ref_recno) {
/*
* We don't have to correct for base == 0 because the
* only way for base to be 0 is if recno is the page's
@@ -237,13 +235,13 @@ leaf_only:
* do in that case, the record may be appended to the page.
*/
if (page->type == WT_PAGE_COL_FIX) {
- if (recno < page->pg_fix_recno) {
- cbt->recno = page->pg_fix_recno;
+ if (recno < current->ref_recno) {
+ cbt->recno = current->ref_recno;
cbt->compare = 1;
return (0);
}
- if (recno >= page->pg_fix_recno + page->pg_fix_entries) {
- cbt->recno = page->pg_fix_recno + page->pg_fix_entries;
+ if (recno >= current->ref_recno + page->pg_fix_entries) {
+ cbt->recno = current->ref_recno + page->pg_fix_entries;
goto past_end;
} else {
cbt->recno = recno;
@@ -251,14 +249,14 @@ leaf_only:
ins_head = WT_COL_UPDATE_SINGLE(page);
}
} else {
- if (recno < page->pg_var_recno) {
- cbt->recno = page->pg_var_recno;
+ if (recno < current->ref_recno) {
+ cbt->recno = current->ref_recno;
cbt->slot = 0;
cbt->compare = 1;
return (0);
}
- if ((cip = __col_var_search(page, recno, NULL)) == NULL) {
- cbt->recno = __col_var_last_recno(page);
+ if ((cip = __col_var_search(current, recno, NULL)) == NULL) {
+ cbt->recno = __col_var_last_recno(current);
cbt->slot = page->pg_var_entries == 0 ?
0 : page->pg_var_entries - 1;
goto past_end;
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 9fff092d079..83fd2dad9e4 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -517,7 +517,7 @@ __wt_row_ikey(WT_SESSION_IMPL *session,
{
uintptr_t oldv;
- oldv = (uintptr_t)ref->key.ikey;
+ oldv = (uintptr_t)ref->ref_ikey;
WT_DIAGNOSTIC_YIELD;
/*
@@ -527,10 +527,10 @@ __wt_row_ikey(WT_SESSION_IMPL *session,
WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0);
WT_ASSERT(session, ref->state != WT_REF_SPLIT);
WT_ASSERT(session,
- __wt_atomic_cas_ptr(&ref->key.ikey, (WT_IKEY *)oldv, ikey));
+ __wt_atomic_cas_ptr(&ref->ref_ikey, (WT_IKEY *)oldv, ikey));
}
#else
- ref->key.ikey = ikey;
+ ref->ref_ikey = ikey;
#endif
return (0);
}
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 176016bb340..f0424ff93b4 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -53,6 +53,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head, **ins_headp;
WT_PAGE *page;
+ WT_PAGE_MODIFY *mod;
WT_UPDATE *old_upd, *upd, **upd_entry;
size_t ins_size, upd_size;
uint32_t ins_slot;
@@ -70,6 +71,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* If we don't yet have a modify structure, we'll need one. */
WT_RET(__wt_page_modify_init(session, page));
+ mod = page->modify;
/*
* Modify: allocate an update array as necessary, build a WT_UPDATE
@@ -83,11 +85,12 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (cbt->compare == 0) {
if (cbt->ins == NULL) {
/* Allocate an update array as necessary. */
- WT_PAGE_ALLOC_AND_SWAP(session, page,
- page->pg_row_upd, upd_entry, page->pg_row_entries);
+ WT_PAGE_ALLOC_AND_SWAP(session,
+ page, mod->mod_row_update,
+ upd_entry, page->pg_row_entries);
/* Set the WT_UPDATE array reference. */
- upd_entry = &page->pg_row_upd[cbt->slot];
+ upd_entry = &mod->mod_row_update[cbt->slot];
} else
upd_entry = &cbt->ins->upd;
@@ -144,11 +147,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
* slot. That's hard, so we set a flag.
*/
WT_PAGE_ALLOC_AND_SWAP(session, page,
- page->pg_row_ins, ins_headp, page->pg_row_entries + 1);
+ mod->mod_row_insert, ins_headp, page->pg_row_entries + 1);
ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
page->pg_row_entries: cbt->slot;
- ins_headp = &page->pg_row_ins[ins_slot];
+ ins_headp = &mod->mod_row_insert[ins_slot];
/* Allocate the WT_INSERT_HEAD structure as necessary. */
WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 6169a0a810a..a631764be7e 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -634,6 +634,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_INSERT *ins, **start, **stop;
WT_INSERT_HEAD *ins_head;
WT_PAGE *page;
+ uint64_t samples;
uint32_t choice, entries, i;
int level;
@@ -688,7 +689,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
* Step down the skip list levels, selecting a random chunk of the name
* space at each level.
*/
- while (level > 0) {
+ for (samples = entries; level > 0; samples += entries) {
/*
* There are (entries) or (entries + 1) chunks of the name space
* considered at each level. They are: between start and the 1st
@@ -765,6 +766,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
cbt->ins_head = ins_head;
cbt->compare = 0;
+ /*
+ * Random lookups in newly created collections can be slow if a page
+ * consists of a large skiplist. Schedule the page for eviction if we
+ * encounter a large skiplist. This is worthwhile because applications
+ * that take a sample often take many samples, so the overhead of
+ * traversing the skip list each time accumulates to real time.
+ */
+ if (samples > 5000)
+ __wt_page_evict_soon(page);
+
return (0);
}
diff --git a/src/support/cksum.c b/src/checksum/checksum.c
index 0b086753406..0b086753406 100644
--- a/src/support/cksum.c
+++ b/src/checksum/checksum.c
diff --git a/src/support/power8/LICENSE.TXT b/src/checksum/power8/LICENSE.TXT
index 2f4bb91f574..2f4bb91f574 100644
--- a/src/support/power8/LICENSE.TXT
+++ b/src/checksum/power8/LICENSE.TXT
diff --git a/src/support/power8/README.md b/src/checksum/power8/README.md
index 3e2976650cd..3e2976650cd 100644
--- a/src/support/power8/README.md
+++ b/src/checksum/power8/README.md
diff --git a/src/support/power8/crc32.S b/src/checksum/power8/crc32.S
index c0b81143f07..c0b81143f07 100644
--- a/src/support/power8/crc32.S
+++ b/src/checksum/power8/crc32.S
diff --git a/src/support/power8/crc32_constants.h b/src/checksum/power8/crc32_constants.h
index 02c471d1c56..02c471d1c56 100644
--- a/src/support/power8/crc32_constants.h
+++ b/src/checksum/power8/crc32_constants.h
diff --git a/src/support/power8/crc32_wrapper.c b/src/checksum/power8/crc32_wrapper.c
index 34ac4150338..34ac4150338 100644
--- a/src/support/power8/crc32_wrapper.c
+++ b/src/checksum/power8/crc32_wrapper.c
diff --git a/src/support/power8/ppc-opcode.h b/src/checksum/power8/ppc-opcode.h
index b63feea60a0..b63feea60a0 100644
--- a/src/support/power8/ppc-opcode.h
+++ b/src/checksum/power8/ppc-opcode.h
diff --git a/src/config/config_def.c b/src/config/config_def.c
index 5b6f0bac323..c7bbdf50280 100644
--- a/src/config/config_def.c
+++ b/src/config/config_def.c
@@ -17,6 +17,7 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_close[] = {
static const WT_CONFIG_CHECK confchk_WT_CONNECTION_load_extension[] = {
{ "config", "string", NULL, NULL, NULL, 0 },
+ { "early_load", "boolean", NULL, NULL, NULL, 0 },
{ "entry", "string", NULL, NULL, NULL, 0 },
{ "terminate", "string", NULL, NULL, NULL, 0 },
{ NULL, NULL, NULL, NULL, NULL, 0 }
@@ -304,6 +305,9 @@ static const WT_CONFIG_CHECK confchk_WT_SESSION_join[] = {
NULL, "choices=[\"eq\",\"ge\",\"gt\",\"le\",\"lt\"]",
NULL, 0 },
{ "count", "int", NULL, NULL, NULL, 0 },
+ { "operation", "string",
+ NULL, "choices=[\"and\",\"or\"]",
+ NULL, 0 },
{ "strategy", "string",
NULL, "choices=[\"bloom\",\"default\"]",
NULL, 0 },
@@ -955,9 +959,9 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_CONNECTION_close, 1
},
{ "WT_CONNECTION.load_extension",
- "config=,entry=wiredtiger_extension_init,"
+ "config=,early_load=0,entry=wiredtiger_extension_init,"
"terminate=wiredtiger_extension_terminate",
- confchk_WT_CONNECTION_load_extension, 3
+ confchk_WT_CONNECTION_load_extension, 4
},
{ "WT_CONNECTION.open_session",
"isolation=read-committed",
@@ -979,6 +983,10 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=",
confchk_WT_CONNECTION_reconfigure, 18
},
+ { "WT_CONNECTION.set_file_system",
+ "",
+ NULL, 0
+ },
{ "WT_CURSOR.close",
"",
NULL, 0
@@ -1031,8 +1039,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
},
{ "WT_SESSION.join",
"bloom_bit_count=16,bloom_hash_count=8,compare=\"eq\",count=,"
- "strategy=",
- confchk_WT_SESSION_join, 5
+ "operation=\"and\",strategy=",
+ confchk_WT_SESSION_join, 6
},
{ "WT_SESSION.log_flush",
"sync=on",
diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c
index 9e2f03da21f..18ad383ec74 100644
--- a/src/conn/conn_api.c
+++ b/src/conn/conn_api.c
@@ -806,6 +806,7 @@ static int
__conn_load_default_extensions(WT_CONNECTION_IMPL *conn)
{
WT_UNUSED(conn);
+
#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY
WT_RET(snappy_extension_init(&conn->iface, NULL));
#endif
@@ -819,18 +820,16 @@ __conn_load_default_extensions(WT_CONNECTION_IMPL *conn)
}
/*
- * __conn_load_extension --
- * WT_CONNECTION->load_extension method.
+ * __conn_load_extension_int --
+ * Internal extension load interface.
*/
static int
-__conn_load_extension(
- WT_CONNECTION *wt_conn, const char *path, const char *config)
+__conn_load_extension_int(WT_SESSION_IMPL *session,
+ const char *path, const char *cfg[], bool early_load)
{
WT_CONFIG_ITEM cval;
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_DLH *dlh;
- WT_SESSION_IMPL *session;
int (*load)(WT_CONNECTION *, WT_CONFIG_ARG *);
bool is_local;
const char *init_name, *terminate_name;
@@ -839,8 +838,10 @@ __conn_load_extension(
init_name = terminate_name = NULL;
is_local = strcmp(path, "local") == 0;
- conn = (WT_CONNECTION_IMPL *)wt_conn;
- CONNECTION_API_CALL(conn, session, load_extension, config, cfg);
+ /* Ensure that the load matches the phase of startup we are in. */
+ WT_ERR(__wt_config_gets(session, cfg, "early_load", &cval));
+ if ((cval.val == 0 && early_load) || (cval.val != 0 && !early_load))
+ return (0);
/*
* This assumes the underlying shared libraries are reference counted,
@@ -865,20 +866,39 @@ __conn_load_extension(
__wt_dlsym(session, dlh, terminate_name, false, &dlh->terminate));
/* Call the load function last, it simplifies error handling. */
- WT_ERR(load(wt_conn, (WT_CONFIG_ARG *)cfg));
+ WT_ERR(load(&S2C(session)->iface, (WT_CONFIG_ARG *)cfg));
/* Link onto the environment's list of open libraries. */
- __wt_spin_lock(session, &conn->api_lock);
- TAILQ_INSERT_TAIL(&conn->dlhqh, dlh, q);
- __wt_spin_unlock(session, &conn->api_lock);
+ __wt_spin_lock(session, &S2C(session)->api_lock);
+ TAILQ_INSERT_TAIL(&S2C(session)->dlhqh, dlh, q);
+ __wt_spin_unlock(session, &S2C(session)->api_lock);
dlh = NULL;
err: if (dlh != NULL)
WT_TRET(__wt_dlclose(session, dlh));
__wt_free(session, init_name);
__wt_free(session, terminate_name);
+ return (ret);
+}
- API_END_RET_NOTFOUND_MAP(session, ret);
+/*
+ * __conn_load_extension --
+ * WT_CONNECTION->load_extension method. This is the public API wrapper:
+ * it casts wt_conn to the connection implementation, enters the API with
+ * CONNECTION_API_CALL (the cfg array passed on below is not declared
+ * here, so it is presumably set up by that macro from config), and hands
+ * path and cfg to __conn_load_extension_int with early_load == false.
+ * Because __conn_load_extension_int returns 0 without loading when the
+ * extension's "early_load" setting does not match the early_load argument,
+ * an extension configured with early_load=true is skipped on this path.
+ * The result is returned through API_END_RET_NOTFOUND_MAP.
+ */
+static int
+__conn_load_extension(
+ WT_CONNECTION *wt_conn, const char *path, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, load_extension, config, cfg);
+
+ ret = __conn_load_extension_int(session, path, cfg, false);
+
+err: API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
@@ -886,18 +906,16 @@ err: if (dlh != NULL)
* Load the list of application-configured extensions.
*/
static int
-__conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
+__conn_load_extensions(
+ WT_SESSION_IMPL *session, const char *cfg[], bool early_load)
{
WT_CONFIG subconfig;
WT_CONFIG_ITEM cval, skey, sval;
- WT_CONNECTION_IMPL *conn;
WT_DECL_ITEM(exconfig);
WT_DECL_ITEM(expath);
WT_DECL_RET;
-
- conn = S2C(session);
-
- WT_ERR(__conn_load_default_extensions(conn));
+ const char *sub_cfg[] = {
+ WT_CONFIG_BASE(session, WT_CONNECTION_load_extension), NULL, NULL };
WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
@@ -912,8 +930,9 @@ __conn_load_extensions(WT_SESSION_IMPL *session, const char *cfg[])
WT_ERR(__wt_buf_fmt(session,
exconfig, "%.*s", (int)sval.len, sval.str));
}
- WT_ERR(conn->iface.load_extension(&conn->iface,
- expath->data, (sval.len > 0) ? exconfig->data : NULL));
+ sub_cfg[1] = sval.len > 0 ? exconfig->data : NULL;
+ WT_ERR(__conn_load_extension_int(
+ session, expath->data, sub_cfg, early_load));
}
WT_ERR_NOTFOUND_OK(ret);
@@ -1192,13 +1211,12 @@ __conn_config_file(WT_SESSION_IMPL *session,
fh = NULL;
/* Configuration files are always optional. */
- WT_RET(__wt_exist(session, filename, &exist));
+ WT_RET(__wt_fs_exist(session, filename, &exist));
if (!exist)
return (0);
/* Open the configuration file. */
- WT_RET(__wt_open(
- session, filename, WT_FILE_TYPE_REGULAR, WT_OPEN_READONLY, &fh));
+ WT_RET(__wt_open(session, filename, WT_OPEN_FILE_TYPE_REGULAR, 0, &fh));
WT_ERR(__wt_filesize(session, fh, &size));
if (size == 0)
goto err;
@@ -1489,8 +1507,8 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
*/
exist = false;
if (!is_create)
- WT_ERR(__wt_exist(session, WT_WIREDTIGER, &exist));
- ret = __wt_open(session, WT_SINGLETHREAD, WT_FILE_TYPE_REGULAR,
+ WT_ERR(__wt_fs_exist(session, WT_WIREDTIGER, &exist));
+ ret = __wt_open(session, WT_SINGLETHREAD, WT_OPEN_FILE_TYPE_REGULAR,
is_create || exist ? WT_OPEN_CREATE : 0, &conn->lock_fh);
/*
@@ -1546,7 +1564,7 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
/* We own the lock file, optionally create the WiredTiger file. */
ret = __wt_open(session, WT_WIREDTIGER,
- WT_FILE_TYPE_REGULAR, is_create ? WT_OPEN_CREATE : 0, &fh);
+ WT_OPEN_FILE_TYPE_REGULAR, is_create ? WT_OPEN_CREATE : 0, &fh);
/*
* If we're read-only, check for success as well as handled errors.
@@ -1583,13 +1601,14 @@ __conn_single(WT_SESSION_IMPL *session, const char *cfg[])
* and there's never a database home after that point without a turtle
* file. If the turtle file doesn't exist, it's a create.
*/
- WT_ERR(__wt_exist(session, WT_METADATA_TURTLE, &exist));
+ WT_ERR(__wt_fs_exist(session, WT_METADATA_TURTLE, &exist));
conn->is_new = exist ? 0 : 1;
if (conn->is_new) {
if (F_ISSET(conn, WT_CONN_READONLY))
- WT_ERR_MSG(session, EINVAL, "Creating a new database is"
- " incompatible with read-only configuration.");
+ WT_ERR_MSG(session, EINVAL,
+ "Creating a new database is incompatible with "
+ "read-only configuration");
len = (size_t)snprintf(buf, sizeof(buf),
"%s\n%s\n", WT_WIREDTIGER, WIREDTIGER_VERSION_STRING);
WT_ERR(__wt_write(session, fh, (wt_off_t)0, len, buf));
@@ -1754,14 +1773,14 @@ __wt_verbose_config(WT_SESSION_IMPL *session, const char *cfg[])
static int
__conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
{
- WT_FH *fh;
+ WT_FSTREAM *fs;
WT_CONFIG parser;
WT_CONFIG_ITEM cval, k, v;
WT_DECL_RET;
bool exist;
const char *base_config;
- fh = NULL;
+ fs = NULL;
base_config = NULL;
/*
@@ -1789,15 +1808,14 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
* only NOT exist if we crashed before it was created; in other words,
* if the base configuration file exists, we're done.
*/
- WT_RET(__wt_exist(session, WT_BASECONFIG, &exist));
+ WT_RET(__wt_fs_exist(session, WT_BASECONFIG, &exist));
if (exist)
return (0);
- WT_RET(__wt_open(session,
- WT_BASECONFIG_SET, WT_FILE_TYPE_REGULAR,
- WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE | WT_STREAM_WRITE, &fh));
+ WT_RET(__wt_fopen(session, WT_BASECONFIG_SET,
+ WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
- WT_ERR(__wt_fprintf(session, fh, "%s\n\n",
+ WT_ERR(__wt_fprintf(session, fs, "%s\n\n",
"# Do not modify this file.\n"
"#\n"
"# WiredTiger created this file when the database was created,\n"
@@ -1844,18 +1862,18 @@ __conn_write_base_config(WT_SESSION_IMPL *session, const char *cfg[])
--v.str;
v.len += 2;
}
- WT_ERR(__wt_fprintf(session, fh,
+ WT_ERR(__wt_fprintf(session, fs,
"%.*s=%.*s\n", (int)k.len, k.str, (int)v.len, v.str));
}
WT_ERR_NOTFOUND_OK(ret);
- /* Flush the handle and rename the file into place. */
- ret = __wt_sync_handle_and_rename(
- session, &fh, WT_BASECONFIG_SET, WT_BASECONFIG);
+ /* Flush the stream and rename the file into place. */
+ ret = __wt_sync_and_rename(
+ session, &fs, WT_BASECONFIG_SET, WT_BASECONFIG);
if (0) {
/* Close open file handle, remove any temporary file. */
-err: WT_TRET(__wt_close(session, &fh));
+err: WT_TRET(__wt_fclose(session, &fs));
WT_TRET(__wt_remove_if_exists(session, WT_BASECONFIG_SET));
}
@@ -1865,6 +1883,57 @@ err: WT_TRET(__wt_close(session, &fh));
}
/*
+ * __conn_set_file_system --
+ * Configure a custom file system implementation on database open. This
+ * is the WT_CONNECTION method that records the caller's WT_FILE_SYSTEM in
+ * conn->file_system, replacing any value already stored there. The
+ * pointer is stored as given: it is not checked for NULL and none of its
+ * methods are validated here (wiredtiger_open calls
+ * __conn_chk_file_system for that once early extensions are loaded).
+ * The config string is only passed to CONNECTION_API_CALL; the resulting
+ * cfg is unused.
+ */
+static int
+__conn_set_file_system(
+ WT_CONNECTION *wt_conn, WT_FILE_SYSTEM *file_system, const char *config)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ conn = (WT_CONNECTION_IMPL *)wt_conn;
+ CONNECTION_API_CALL(conn, session, set_file_system, config, cfg);
+ WT_UNUSED(cfg);
+
+ conn->file_system = file_system;
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __conn_chk_file_system --
+ * Check the configured file system: report EINVAL through WT_RET_MSG,
+ * naming the missing method in the error message, if a required
+ * WT_FILE_SYSTEM method pointer is NULL. directory_list,
+ * directory_list_free, exist, open_file and size are always required;
+ * remove and rename are required only when readonly is false;
+ * directory_sync is optional. The terminate method is not checked here.
+ * conn->file_system is dereferenced without a NULL check, so the caller
+ * must have set it first.
+ */
+static int
+__conn_chk_file_system(WT_SESSION_IMPL *session, bool readonly)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+#define WT_CONN_SET_FILE_SYSTEM_REQ(name) \
+ if (conn->file_system->name == NULL) \
+ WT_RET_MSG(session, EINVAL, \
+ "a WT_FILE_SYSTEM.%s method must be configured", #name)
+
+ WT_CONN_SET_FILE_SYSTEM_REQ(directory_list);
+ WT_CONN_SET_FILE_SYSTEM_REQ(directory_list_free);
+ /* not required: directory_sync */
+ WT_CONN_SET_FILE_SYSTEM_REQ(exist);
+ WT_CONN_SET_FILE_SYSTEM_REQ(open_file);
+ if (!readonly) {
+ WT_CONN_SET_FILE_SYSTEM_REQ(remove);
+ WT_CONN_SET_FILE_SYSTEM_REQ(rename);
+ }
+ WT_CONN_SET_FILE_SYSTEM_REQ(size);
+
+ return (0);
+}
+
+/*
* wiredtiger_open --
* Main library entry point: open a new connection to a WiredTiger
* database.
@@ -1888,12 +1957,13 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
__conn_add_compressor,
__conn_add_encryptor,
__conn_add_extractor,
+ __conn_set_file_system,
__conn_get_extension_api
};
static const WT_NAME_FLAG file_types[] = {
- { "checkpoint", WT_FILE_TYPE_CHECKPOINT },
- { "data", WT_FILE_TYPE_DATA },
- { "log", WT_FILE_TYPE_LOG },
+ { "checkpoint", WT_DIRECT_IO_CHECKPOINT },
+ { "data", WT_DIRECT_IO_DATA },
+ { "log", WT_DIRECT_IO_LOG },
{ NULL, 0 }
};
@@ -1983,10 +2053,27 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
F_SET(conn, WT_CONN_READONLY);
/*
- * After checking readonly and in-memory, but before we do anything that
- * touches the filesystem, configure the OS layer.
+ * Load early extensions before doing further initialization (one early
+ * extension is to configure a file system).
*/
- WT_ERR(__wt_os_init(session));
+ WT_ERR(__conn_load_extensions(session, cfg, true));
+
+ /*
+ * If the application didn't configure its own file system, configure
+ * one of ours. Check to ensure we have a valid file system.
+ */
+ if (conn->file_system == NULL) {
+ if (F_ISSET(conn, WT_CONN_IN_MEMORY))
+ WT_ERR(__wt_os_inmemory(session));
+ else
+#if defined(_MSC_VER)
+ WT_ERR(__wt_os_win(session));
+#else
+ WT_ERR(__wt_os_posix(session));
+#endif
+ }
+ WT_ERR(
+ __conn_chk_file_system(session, F_ISSET(conn, WT_CONN_READONLY)));
/*
* Capture the config_base setting file for later use. Again, if the
@@ -2036,7 +2123,7 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
__conn_config_append(cfg, version);
/* Ignore the base_config file if config_base_set is false. */
- if (config_base_set || F_ISSET(conn, WT_CONN_READONLY))
+ if (config_base_set)
WT_ERR(
__conn_config_file(session, WT_BASECONFIG, false, cfg, i1));
__conn_config_append(cfg, config);
@@ -2119,8 +2206,8 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
if (ret == 0) {
if (sval.val)
FLD_SET(conn->direct_io, ft->flag);
- } else if (ret != WT_NOTFOUND)
- goto err;
+ } else
+ WT_ERR_NOTFOUND_OK(ret);
}
WT_ERR(__wt_config_gets(session, cfg, "write_through", &cval));
@@ -2129,8 +2216,8 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
if (ret == 0) {
if (sval.val)
FLD_SET(conn->write_through, ft->flag);
- } else if (ret != WT_NOTFOUND)
- goto err;
+ } else
+ WT_ERR_NOTFOUND_OK(ret);
}
/*
@@ -2154,15 +2241,15 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
ret = __wt_config_subgets(session, &cval, ft->name, &sval);
if (ret == 0) {
switch (ft->flag) {
- case WT_FILE_TYPE_DATA:
+ case WT_DIRECT_IO_DATA:
conn->data_extend_len = sval.val;
break;
- case WT_FILE_TYPE_LOG:
+ case WT_DIRECT_IO_LOG:
conn->log_extend_len = sval.val;
break;
}
- } else if (ret != WT_NOTFOUND)
- goto err;
+ } else
+ WT_ERR_NOTFOUND_OK(ret);
}
WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
@@ -2191,7 +2278,8 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
* everything else to be in place, and the extensions call back into the
* library.
*/
- WT_ERR(__conn_load_extensions(session, cfg));
+ WT_ERR(__conn_load_default_extensions(conn));
+ WT_ERR(__conn_load_extensions(session, cfg, false));
/*
* The metadata/log encryptor is configured after extensions, since
diff --git a/src/conn/conn_cache.c b/src/conn/conn_cache.c
index 9a2c394e9a6..4d33ac608bb 100644
--- a/src/conn/conn_cache.c
+++ b/src/conn/conn_cache.c
@@ -127,6 +127,7 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ int i;
conn = S2C(session);
@@ -157,13 +158,18 @@ __wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
false, 10000, WT_MILLION, &cache->evict_cond));
WT_ERR(__wt_cond_alloc(session,
"eviction waiters", false, &cache->evict_waiter_cond));
- WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
+ WT_ERR(__wt_spin_init(session,
+ &cache->evict_queue_lock, "cache eviction queue"));
WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));
/* Allocate the LRU eviction queue. */
cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
- WT_ERR(__wt_calloc_def(session,
- cache->evict_slots, &cache->evict_queue));
+ for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) {
+ WT_ERR(__wt_calloc_def(session,
+ cache->evict_slots, &cache->evict_queues[i].evict_queue));
+ WT_ERR(__wt_spin_init(session,
+ &cache->evict_queues[i].evict_lock, "cache eviction"));
+ }
/*
* We get/set some values in the cache statistics (rather than have
@@ -229,6 +235,7 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ int i;
conn = S2C(session);
cache = conn->cache;
@@ -254,10 +261,13 @@ __wt_cache_destroy(WT_SESSION_IMPL *session)
WT_TRET(__wt_cond_auto_destroy(session, &cache->evict_cond));
WT_TRET(__wt_cond_destroy(session, &cache->evict_waiter_cond));
- __wt_spin_destroy(session, &cache->evict_lock);
+ __wt_spin_destroy(session, &cache->evict_queue_lock);
__wt_spin_destroy(session, &cache->evict_walk_lock);
- __wt_free(session, cache->evict_queue);
+ for (i = 0; i < WT_EVICT_QUEUE_MAX; ++i) {
+ __wt_spin_destroy(session, &cache->evict_queues[i].evict_lock);
+ __wt_free(session, cache->evict_queues[i].evict_queue);
+ }
__wt_free(session, conn->cache);
return (ret);
}
diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c
index 72f23b015b7..1e34b514aa7 100644
--- a/src/conn/conn_cache_pool.c
+++ b/src/conn/conn_cache_pool.c
@@ -58,7 +58,6 @@ __wt_cache_pool_config(WT_SESSION_IMPL *session, const char **cfg)
created = updating = false;
pool_name = NULL;
cp = NULL;
- size = 0;
if (F_ISSET(conn, WT_CONN_CACHE_POOL))
updating = true;
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 5f4c38e7361..509966793e5 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -149,15 +149,17 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->page_lock[i]);
__wt_free(session, conn->page_lock);
+ /* Destroy the file-system configuration. */
+ if (conn->file_system != NULL && conn->file_system->terminate != NULL)
+ WT_TRET(conn->file_system->terminate(
+ conn->file_system, (WT_SESSION *)session));
+
/* Free allocated memory. */
__wt_free(session, conn->cfg);
__wt_free(session, conn->home);
__wt_free(session, conn->error_prefix);
__wt_free(session, conn->sessions);
- /* Destroy the OS configuration. */
- WT_TRET(__wt_os_cleanup(session));
-
__wt_free(NULL, conn);
return (ret);
}
diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c
index 6cb8ba3d0f9..5397962bc4f 100644
--- a/src/conn/conn_log.c
+++ b/src/conn/conn_log.c
@@ -178,6 +178,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
conn = S2C(session);
log = conn->log;
logcount = 0;
+ locked = false;
logfiles = NULL;
/*
@@ -198,14 +199,14 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
* Main archive code. Get the list of all log files and
* remove any earlier than the minimum log number.
*/
- WT_RET(__wt_dirlist(session, conn->log_path,
- WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));
+ WT_ERR(__wt_fs_directory_list(
+ session, conn->log_path, WT_LOG_FILENAME, &logfiles, &logcount));
/*
* We can only archive files if a hot backup is not in progress or
* if we are the backup.
*/
- WT_RET(__wt_readlock(session, conn->hot_backup_lock));
+ WT_ERR(__wt_readlock(session, conn->hot_backup_lock));
locked = true;
if (!conn->hot_backup || backup_file != 0) {
for (i = 0; i < logcount; i++) {
@@ -218,9 +219,6 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
}
WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
locked = false;
- __wt_log_files_free(session, logfiles, logcount);
- logfiles = NULL;
- logcount = 0;
/*
* Indicate what is our new earliest LSN. It is the start
@@ -232,8 +230,7 @@ __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
err: __wt_err(session, ret, "log archive server error");
if (locked)
WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
- if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
return (ret);
}
@@ -259,10 +256,9 @@ __log_prealloc_once(WT_SESSION_IMPL *session)
* Allocate up to the maximum number, accounting for any existing
* files that may not have been used yet.
*/
- WT_ERR(__wt_dirlist(session, conn->log_path,
- WT_LOG_PREPNAME, WT_DIRLIST_INCLUDE, &recfiles, &reccount));
- __wt_log_files_free(session, recfiles, reccount);
- recfiles = NULL;
+ WT_ERR(__wt_fs_directory_list(
+ session, conn->log_path, WT_LOG_PREPNAME, &recfiles, &reccount));
+
/*
* Adjust the number of files to pre-allocate if we find that
* the critical path had to allocate them since we last ran.
@@ -292,8 +288,7 @@ __log_prealloc_once(WT_SESSION_IMPL *session)
if (0)
err: __wt_err(session, ret, "log pre-alloc server error");
- if (recfiles != NULL)
- __wt_log_files_free(session, recfiles, reccount);
+ WT_TRET(__wt_fs_directory_list_free(session, &recfiles, reccount));
return (ret);
}
@@ -314,12 +309,15 @@ __wt_log_truncate_files(
WT_UNUSED(cfg);
conn = S2C(session);
- log = conn->log;
+ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
+ return (0);
if (F_ISSET(conn, WT_CONN_SERVER_RUN) &&
FLD_ISSET(conn->log_flags, WT_CONN_LOG_ARCHIVE))
WT_RET_MSG(session, EINVAL,
"Attempt to archive manually while a server is running");
+ log = conn->log;
+
backup_file = 0;
if (cursor != NULL)
backup_file = WT_CURSOR_BACKUP_ID(cursor);
@@ -327,6 +325,7 @@ __wt_log_truncate_files(
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"log_truncate_files: Archive once up to %" PRIu32,
backup_file));
+
WT_RET(__wt_writelock(session, log->log_archive_lock));
locked = true;
WT_ERR(__log_archive_once(session, backup_file));
@@ -679,7 +678,6 @@ __log_wrlsn_server(void *arg)
log = conn->log;
yield = 0;
WT_INIT_LSN(&prev);
- did_work = false;
while (F_ISSET(conn, WT_CONN_LOG_SERVER_RUN)) {
/*
* Write out any log record buffers if anything was done
@@ -694,10 +692,8 @@ __log_wrlsn_server(void *arg)
else
WT_STAT_FAST_CONN_INCR(session, log_write_lsn_skip);
prev = log->alloc_lsn;
- if (yield == 0)
- did_work = true;
- else
- did_work = false;
+ did_work = yield == 0;
+
/*
* If __wt_log_wrlsn did work we want to yield instead of sleep.
*/
@@ -867,9 +863,9 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
"log write LSN"));
WT_RET(__wt_rwlock_alloc(session,
&log->log_archive_lock, "log archive lock"));
- if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
- log->allocsize =
- WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN);
+ if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
+ log->allocsize = (uint32_t)
+ WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN);
else
log->allocsize = WT_LOG_ALIGN;
WT_INIT_LSN(&log->alloc_lsn);
diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c
index 38c3288209e..f5722d343f7 100644
--- a/src/conn/conn_open.c
+++ b/src/conn/conn_open.c
@@ -93,7 +93,7 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn)
* transaction ID will catch up with the current ID.
*/
for (;;) {
- __wt_txn_update_oldest(session, true);
+ WT_TRET(__wt_txn_update_oldest(session, true));
if (txn_global->oldest_id == txn_global->current)
break;
__wt_yield();
diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c
index fccc4786402..855ff57808e 100644
--- a/src/conn/conn_stat.c
+++ b/src/conn/conn_stat.c
@@ -209,11 +209,11 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats)
}
if (FLD_ISSET(conn->stat_flags, WT_CONN_STAT_JSON)) {
- WT_ERR(__wt_fprintf(session, conn->stat_fh,
+ WT_ERR(__wt_fprintf(session, conn->stat_fs,
"{\"version\":\"%s\",\"localTime\":\"%s\"",
WIREDTIGER_VERSION_STRING, conn->stat_stamp));
WT_ERR(__wt_fprintf(
- session, conn->stat_fh, ",\"wiredTiger\":{"));
+ session, conn->stat_fs, ",\"wiredTiger\":{"));
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val));
/* Check if we are starting a new section. */
@@ -225,23 +225,23 @@ __statlog_dump(WT_SESSION_IMPL *session, const char *name, bool conn_stats)
strncmp(desc, tmp->data, tmp->size) != 0) {
WT_ERR(__wt_buf_set(
session, tmp, desc, prefixlen));
- WT_ERR(__wt_fprintf(session, conn->stat_fh,
+ WT_ERR(__wt_fprintf(session, conn->stat_fs,
"%s\"%.*s\":{", first ? "" : "},",
(int)prefixlen, desc));
first = false;
groupfirst = true;
}
- WT_ERR(__wt_fprintf(session, conn->stat_fh,
+ WT_ERR(__wt_fprintf(session, conn->stat_fs,
"%s\"%s\":%" PRId64,
groupfirst ? "" : ",", endprefix + 2, val));
groupfirst = false;
}
WT_ERR_NOTFOUND_OK(ret);
- WT_ERR(__wt_fprintf(session, conn->stat_fh, "}}}\n"));
+ WT_ERR(__wt_fprintf(session, conn->stat_fs, "}}}\n"));
} else {
while ((ret = cursor->next(cursor)) == 0) {
WT_ERR(cursor->get_value(cursor, &desc, &valstr, &val));
- WT_ERR(__wt_fprintf(session, conn->stat_fh,
+ WT_ERR(__wt_fprintf(session, conn->stat_fs,
"%s %" PRId64 " %s %s\n",
conn->stat_stamp, val, name, desc));
}
@@ -354,7 +354,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
struct tm *tm, _tm;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_FH *log_file;
+ WT_FSTREAM *log_stream;
conn = S2C(session);
@@ -367,18 +367,16 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
WT_RET_MSG(session, ENOMEM, "strftime path conversion");
/* If the path has changed, cycle the log file. */
- if ((log_file = conn->stat_fh) == NULL ||
+ if ((log_stream = conn->stat_fs) == NULL ||
path == NULL || strcmp(tmp->mem, path->mem) != 0) {
- conn->stat_fh = NULL;
- WT_RET(__wt_close(session, &log_file));
+ WT_RET(__wt_fclose(session, &conn->stat_fs));
if (path != NULL)
(void)strcpy(path->mem, tmp->mem);
- WT_RET(__wt_open(session, tmp->mem,
- WT_FILE_TYPE_REGULAR,
- WT_OPEN_CREATE | WT_OPEN_FIXED | WT_STREAM_APPEND,
- &log_file));
+ WT_RET(__wt_fopen(session, tmp->mem,
+ WT_OPEN_CREATE | WT_OPEN_FIXED, WT_STREAM_APPEND,
+ &log_stream));
}
- conn->stat_fh = log_file;
+ conn->stat_fs = log_stream;
/* Create the entry prefix for this time of day. */
if (strftime(tmp->mem, tmp->memsize, conn->stat_format, tm) == 0)
@@ -411,7 +409,7 @@ __statlog_log_one(WT_SESSION_IMPL *session, WT_ITEM *path, WT_ITEM *tmp)
WT_RET(__statlog_lsm_apply(session));
/* Flush. */
- return (__wt_fsync(session, conn->stat_fh, true));
+ return (__wt_fflush(session, conn->stat_fs));
}
/*
@@ -597,7 +595,7 @@ __wt_statlog_destroy(WT_SESSION_IMPL *session, bool is_close)
conn->stat_session = NULL;
conn->stat_tid_set = false;
conn->stat_format = NULL;
- WT_TRET(__wt_close(session, &conn->stat_fh));
+ WT_TRET(__wt_fclose(session, &conn->stat_fs));
conn->stat_path = NULL;
conn->stat_sources = NULL;
conn->stat_stamp = NULL;
diff --git a/src/cursor/cur_backup.c b/src/cursor/cur_backup.c
index 5be9b311a79..4ee23008687 100644
--- a/src/cursor/cur_backup.c
+++ b/src/cursor/cur_backup.c
@@ -10,7 +10,6 @@
static int __backup_all(WT_SESSION_IMPL *);
static int __backup_cleanup_handles(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *);
-static int __backup_file_create(WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, bool);
static int __backup_list_append(
WT_SESSION_IMPL *, WT_CURSOR_BACKUP *, const char *);
static int __backup_list_uri_append(WT_SESSION_IMPL *, const char *, bool *);
@@ -178,8 +177,7 @@ __backup_log_append(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool active)
for (i = 0; i < logcount; i++)
WT_ERR(__backup_list_append(session, cb, logfiles[i]));
}
-err: if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
return (ret);
}
@@ -193,9 +191,13 @@ __backup_start(
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_FSTREAM *srcfs;
+ const char *dest;
bool exist, log_only, target_list;
conn = S2C(session);
+ srcfs = NULL;
+ dest = NULL;
cb->next = 0;
cb->list = NULL;
@@ -224,11 +226,16 @@ __backup_start(
conn->hot_backup = true;
WT_ERR(__wt_writeunlock(session, conn->hot_backup_lock));
- /* Create the hot backup file. */
- WT_ERR(__backup_file_create(session, cb, false));
-
- /* Add log files if logging is enabled. */
-
+ /*
+ * Create a temporary backup file. This must be opened before
+ * generating the list of targets in __backup_uri. This file will
+ * later be renamed to the correct name depending on whether or not
+ * we're doing an incremental backup. We need a temp file so that if
+ * we fail or crash while filling it, the existence of a partial file
+ * doesn't confuse restarting in the source database.
+ */
+ WT_ERR(__wt_fopen(session, WT_BACKUP_TMP,
+ WT_OPEN_CREATE, WT_STREAM_WRITE, &cb->bfs));
/*
* If a list of targets was specified, work our way through them.
* Else, generate a list of all database objects.
@@ -248,20 +255,23 @@ __backup_start(
/* Add the hot backup and standard WiredTiger files to the list. */
if (log_only) {
/*
- * Close any hot backup file.
- * We're about to open the incremental backup file.
+ * We also open an incremental backup source file so that we
+ * can detect a crash with an incremental backup existing in
+ * the source directory versus an improper destination.
*/
- WT_TRET(__wt_close(session, &cb->bfh));
- WT_ERR(__backup_file_create(session, cb, log_only));
+ dest = WT_INCREMENTAL_BACKUP;
+ WT_ERR(__wt_fopen(session, WT_INCREMENTAL_SRC,
+ WT_OPEN_CREATE, WT_STREAM_WRITE, &srcfs));
WT_ERR(__backup_list_append(
session, cb, WT_INCREMENTAL_BACKUP));
} else {
+ dest = WT_METADATA_BACKUP;
WT_ERR(__backup_list_append(session, cb, WT_METADATA_BACKUP));
- WT_ERR(__wt_exist(session, WT_BASECONFIG, &exist));
+ WT_ERR(__wt_fs_exist(session, WT_BASECONFIG, &exist));
if (exist)
WT_ERR(__backup_list_append(
session, cb, WT_BASECONFIG));
- WT_ERR(__wt_exist(session, WT_USERCONFIG, &exist));
+ WT_ERR(__wt_fs_exist(session, WT_USERCONFIG, &exist));
if (exist)
WT_ERR(__backup_list_append(
session, cb, WT_USERCONFIG));
@@ -269,10 +279,15 @@ __backup_start(
}
err: /* Close the hot backup file. */
- WT_TRET(__wt_close(session, &cb->bfh));
+ WT_TRET(__wt_fclose(session, &cb->bfs));
+ if (srcfs != NULL)
+ WT_TRET(__wt_fclose(session, &srcfs));
if (ret != 0) {
WT_TRET(__backup_cleanup_handles(session, cb));
WT_TRET(__backup_stop(session));
+ } else {
+ WT_ASSERT(session, dest != NULL);
+ WT_TRET(__wt_fs_rename(session, WT_BACKUP_TMP, dest));
}
return (ret);
@@ -384,13 +399,23 @@ __backup_uri(WT_SESSION_IMPL *session,
uri);
/*
- * Handle log targets. We do not need to go through the
- * schema worker, just call the function to append them.
- * Set log_only only if it is our only URI target.
+ * Handle log targets. We do not need to go through the schema
+ * worker, just call the function to append them. Set log_only
+ * only if it is our only URI target.
*/
if (WT_PREFIX_MATCH(uri, "log:")) {
+ /*
+ * Automatic log archival cannot be combined with incremental
+ * backup; reject the request.
+ */
+ if (FLD_ISSET(
+ S2C(session)->log_flags, WT_CONN_LOG_ARCHIVE))
+ WT_ERR_MSG(session, EINVAL,
+ "incremental backup not possible when "
+ "automatic log archival configured");
*log_only = !target_list;
- WT_ERR(__backup_list_uri_append(session, uri, NULL));
+ WT_ERR(__backup_log_append(
+ session, session->bkp_cursor, false));
} else {
*log_only = false;
WT_ERR(__wt_schema_worker(session,
@@ -404,19 +429,6 @@ err: __wt_scr_free(session, &tmp);
}
/*
- * __backup_file_create --
- * Create the meta-data backup file.
- */
-static int
-__backup_file_create(
- WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, bool incremental)
-{
- return (__wt_open(session,
- incremental ? WT_INCREMENTAL_BACKUP : WT_METADATA_BACKUP,
- WT_FILE_TYPE_REGULAR, WT_OPEN_CREATE | WT_STREAM_WRITE, &cb->bfh));
-}
-
-/*
* __wt_backup_file_remove --
* Remove the incremental and meta-data backup files.
*/
@@ -425,7 +437,15 @@ __wt_backup_file_remove(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
+ /*
+ * Note that order matters for removing the incremental files. We must
+ * remove the backup file before removing the source file so that we
+ * always know we were a source directory while there's any chance of
+ * an incremental backup file existing.
+ */
+ WT_TRET(__wt_remove_if_exists(session, WT_BACKUP_TMP));
WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_BACKUP));
+ WT_TRET(__wt_remove_if_exists(session, WT_INCREMENTAL_SRC));
WT_TRET(__wt_remove_if_exists(session, WT_METADATA_BACKUP));
return (ret);
}
@@ -453,11 +473,6 @@ __backup_list_uri_append(
* if there's an entry backed by anything other than a file or lsm
* entry, we're confused.
*/
- if (WT_PREFIX_MATCH(name, "log:")) {
- WT_RET(__backup_log_append(session, cb, false));
- return (0);
- }
-
if (!WT_PREFIX_MATCH(name, "file:") &&
!WT_PREFIX_MATCH(name, "colgroup:") &&
!WT_PREFIX_MATCH(name, "index:") &&
@@ -473,7 +488,7 @@ __backup_list_uri_append(
/* Add the metadata entry to the backup file. */
WT_RET(__wt_metadata_search(session, name, &value));
- ret = __wt_fprintf(session, cb->bfh, "%s\n%s\n", name, value);
+ ret = __wt_fprintf(session, cb->bfs, "%s\n%s\n", name, value);
__wt_free(session, value);
WT_RET(ret);
diff --git a/src/cursor/cur_ds.c b/src/cursor/cur_ds.c
index 804c24a3d2e..d2b8d81ab37 100644
--- a/src/cursor/cur_ds.c
+++ b/src/cursor/cur_ds.c
@@ -16,7 +16,7 @@ static int
__curds_txn_enter(WT_SESSION_IMPL *session)
{
session->ncursors++; /* XXX */
- __wt_txn_cursor_op(session);
+ WT_RET(__wt_txn_cursor_op(session));
return (0);
}
diff --git a/src/cursor/cur_index.c b/src/cursor/cur_index.c
index dbe8046ca21..6de68d86778 100644
--- a/src/cursor/cur_index.c
+++ b/src/cursor/cur_index.c
@@ -8,20 +8,6 @@
#include "wt_internal.h"
- /*
- * __wt_curindex_joined --
- * Produce an error that this cursor is being used in a join call.
- */
-int
-__wt_curindex_joined(WT_CURSOR *cursor)
-{
- WT_SESSION_IMPL *session;
-
- session = (WT_SESSION_IMPL *)cursor->session;
- __wt_errx(session, "index cursor is being used in a join");
- return (ENOTSUP);
-}
-
/*
* __curindex_get_value --
* WT_CURSOR->get_value implementation for index cursors.
@@ -462,7 +448,7 @@ __wt_curindex_open(WT_SESSION_IMPL *session,
if (WT_CURSOR_RECNO(cursor))
WT_ERR_MSG(session, WT_ERROR,
"Column store indexes based on a record number primary "
- "key are not supported.");
+ "key are not supported");
/* Handle projections. */
if (columns != NULL) {
diff --git a/src/cursor/cur_join.c b/src/cursor/cur_join.c
index 38a83217933..fd7de53c981 100644
--- a/src/cursor/cur_join.c
+++ b/src/cursor/cur_join.c
@@ -8,159 +8,299 @@
#include "wt_internal.h"
+static int __curjoin_entries_in_range(WT_SESSION_IMPL *, WT_CURSOR_JOIN *,
+ WT_ITEM *, WT_CURSOR_JOIN_ITER *);
+static int __curjoin_entry_in_range(WT_SESSION_IMPL *, WT_CURSOR_JOIN_ENTRY *,
+ WT_ITEM *, WT_CURSOR_JOIN_ITER *);
+static int __curjoin_entry_member(WT_SESSION_IMPL *, WT_CURSOR_JOIN_ENTRY *,
+ WT_ITEM *, WT_CURSOR_JOIN_ITER *);
static int __curjoin_insert_endpoint(WT_SESSION_IMPL *,
WT_CURSOR_JOIN_ENTRY *, u_int, WT_CURSOR_JOIN_ENDPOINT **);
+static int __curjoin_iter_close(WT_CURSOR_JOIN_ITER *);
+static int __curjoin_iter_close_all(WT_CURSOR_JOIN_ITER *);
+static bool __curjoin_iter_ready(WT_CURSOR_JOIN_ITER *);
+static int __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *, u_int);
+static int __curjoin_pack_recno(WT_SESSION_IMPL *, uint64_t, uint8_t *,
+ size_t, WT_ITEM *);
+static int __curjoin_split_key(WT_SESSION_IMPL *, WT_CURSOR_JOIN *, WT_ITEM *,
+ WT_CURSOR *, WT_CURSOR *, const char *, bool);
+
+#define WT_CURJOIN_ITER_CONSUMED(iter) \
+ ((iter)->entry_pos >= (iter)->entry_count)
/*
- * __curjoin_entry_iter_init --
+ * __wt_curjoin_joined --
+ * Produce an error that this cursor is being used in a join call.
+ */
+int
+__wt_curjoin_joined(WT_CURSOR *cursor)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+ __wt_errx(session, "cursor is being used in a join");
+ return (ENOTSUP);
+}
+
+/*
+ * __curjoin_iter_init --
* Initialize an iteration for the index managed by a join entry.
*
*/
static int
-__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
- WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
+__curjoin_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_CURSOR_JOIN_ITER **iterp)
{
- WT_CURSOR *to_dup;
- WT_DECL_RET;
- const char *raw_cfg[] = { WT_CONFIG_BASE(
- session, WT_SESSION_open_cursor), "raw", NULL };
- const char *def_cfg[] = { WT_CONFIG_BASE(
- session, WT_SESSION_open_cursor), NULL };
- const char *urimain, **config;
- char *mainbuf, *uri;
WT_CURSOR_JOIN_ITER *iter;
- size_t size;
-
- iter = NULL;
- mainbuf = uri = NULL;
- to_dup = entry->ends[0].cursor;
-
- if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
- config = &raw_cfg[0];
- else
- config = &def_cfg[0];
-
- size = strlen(to_dup->internal_uri) + 3;
- WT_ERR(__wt_calloc(session, size, 1, &uri));
- snprintf(uri, size, "%s()", to_dup->internal_uri);
- urimain = cjoin->table->name;
- if (cjoin->projection != NULL) {
- size = strlen(urimain) + strlen(cjoin->projection) + 1;
- WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
- snprintf(mainbuf, size, "%s%s", urimain, cjoin->projection);
- urimain = mainbuf;
- }
- WT_ERR(__wt_calloc_one(session, &iter));
- WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
- &iter->cursor));
- WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
- WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
- &iter->main));
+ *iterp = NULL;
+ WT_RET(__wt_calloc_one(session, iterp));
+ iter = *iterp;
iter->cjoin = cjoin;
iter->session = session;
- iter->entry = entry;
- iter->positioned = false;
- iter->isequal = (entry->ends_next == 1 &&
- WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);
- *iterp = iter;
+ cjoin->iter = iter;
+ WT_RET(__curjoin_iter_set_entry(iter, 0));
+ return (0);
+}
- if (0) {
-err: __wt_free(session, iter);
- }
- __wt_free(session, mainbuf);
- __wt_free(session, uri);
+/*
+ * __curjoin_iter_close --
+ * Close the iteration, release resources.
+ *
+ */
+static int
+__curjoin_iter_close(WT_CURSOR_JOIN_ITER *iter)
+{
+ WT_DECL_RET;
+
+ if (iter->cursor != NULL)
+ WT_TRET(iter->cursor->close(iter->cursor));
+ __wt_free(iter->session, iter);
return (ret);
}
/*
- * __curjoin_pack_recno --
- * Pack the given recno into a buffer; prepare an item referencing it.
+ * __curjoin_iter_close_all --
+ * Free the iterator and all of its children recursively.
*
*/
static int
-__curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf,
- size_t bufsize, WT_ITEM *item)
+__curjoin_iter_close_all(WT_CURSOR_JOIN_ITER *iter)
{
- WT_SESSION *wtsession;
- size_t sz;
+ WT_CURSOR_JOIN *parent;
+ WT_DECL_RET;
- wtsession = (WT_SESSION *)session;
- WT_RET(wiredtiger_struct_size(wtsession, &sz, "r", r));
- WT_ASSERT(session, sz < bufsize);
- WT_RET(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r));
- item->size = sz;
- item->data = buf;
+ if (iter->child)
+ WT_TRET(__curjoin_iter_close_all(iter->child));
+ iter->child = NULL;
+ WT_ASSERT(iter->session, iter->cjoin->parent == NULL ||
+ iter->cjoin->parent->iter->child == iter);
+ if ((parent = iter->cjoin->parent) != NULL)
+ parent->iter->child = NULL;
+ iter->cjoin->iter = NULL;
+ WT_TRET(__curjoin_iter_close(iter));
+ return (ret);
+}
+
+/*
+ * __curjoin_iter_reset --
+ * Reset an iteration to the starting point.
+ *
+ */
+static int
+__curjoin_iter_reset(WT_CURSOR_JOIN_ITER *iter)
+{
+ if (iter->child != NULL)
+ WT_RET(__curjoin_iter_close_all(iter->child));
+ WT_RET(__curjoin_iter_set_entry(iter, 0));
+ iter->positioned = false;
return (0);
}
/*
- * __curjoin_split_key --
- * Copy the primary key from a cursor (either main table or index)
- * to another cursor. When copying from an index file, the index
- * key is also returned.
+ * __curjoin_iter_ready --
+ * Check the positioned flag for all nested iterators.
+ *
+ */
+static bool
+__curjoin_iter_ready(WT_CURSOR_JOIN_ITER *iter)
+{
+ while (iter != NULL) {
+ if (!iter->positioned)
+ return (false);
+ iter = iter->child;
+ }
+ return (true);
+}
+
+/*
+ * __curjoin_iter_set_entry --
+ * Set the current entry for an iterator.
*
*/
static int
-__curjoin_split_key(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
- WT_ITEM *idxkey, WT_CURSOR *tocur, WT_CURSOR *fromcur,
- const char *repack_fmt, bool isindex)
+__curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
{
- WT_CURSOR *firstcg_cur;
- WT_CURSOR_INDEX *cindex;
- WT_ITEM *keyp;
- const uint8_t *p;
+ WT_CURSOR *c, *to_dup;
+ WT_CURSOR_JOIN *cjoin, *topjoin;
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ char *uri;
+ const char *raw_cfg[] = { WT_CONFIG_BASE(
+ iter->session, WT_SESSION_open_cursor), "raw", NULL };
+ const char *def_cfg[] = { WT_CONFIG_BASE(
+ iter->session, WT_SESSION_open_cursor), NULL };
+ const char **config;
+ size_t size;
- if (isindex) {
- cindex = ((WT_CURSOR_INDEX *)fromcur);
- /*
- * Repack tells us where the index key ends; advance past
- * that to get where the raw primary key starts.
- */
- WT_RET(__wt_struct_repack(session, cindex->child->key_format,
- repack_fmt != NULL ? repack_fmt : cindex->iface.key_format,
- &cindex->child->key, idxkey));
- WT_ASSERT(session, cindex->child->key.size > idxkey->size);
- tocur->key.data = (uint8_t *)idxkey->data + idxkey->size;
- tocur->key.size = cindex->child->key.size - idxkey->size;
- if (WT_CURSOR_RECNO(tocur)) {
- p = (const uint8_t *)tocur->key.data;
- WT_RET(__wt_vunpack_uint(&p, tocur->key.size,
- &tocur->recno));
- } else
- tocur->recno = 0;
- } else {
- firstcg_cur = ((WT_CURSOR_TABLE *)fromcur)->cg_cursors[0];
- keyp = &firstcg_cur->key;
- if (WT_CURSOR_RECNO(tocur)) {
- WT_ASSERT(session, keyp->size == sizeof(uint64_t));
- tocur->recno = *(uint64_t *)keyp->data;
- WT_RET(__curjoin_pack_recno(session, tocur->recno,
- cjoin->recno_buf, sizeof(cjoin->recno_buf),
- &tocur->key));
- } else {
- WT_ITEM_SET(tocur->key, *keyp);
- tocur->recno = 0;
+ session = iter->session;
+ cjoin = iter->cjoin;
+ uri = NULL;
+ entry = iter->entry = &cjoin->entries[entry_pos];
+ iter->positioned = false;
+ iter->entry_pos = entry_pos;
+ iter->end_pos = 0;
+
+ iter->is_equal = (entry->ends_next == 1 &&
+ WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);
+ iter->end_skip = (entry->ends_next > 0 &&
+ WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE) ? 1 : 0;
+
+ iter->end_count = WT_MIN(1, entry->ends_next);
+ if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
+ iter->entry_count = cjoin->entries_next;
+ if (iter->is_equal)
+ iter->end_count = entry->ends_next;
+ } else
+ iter->entry_count = 1;
+ WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count);
+
+ entry->stats.actual_count = 0;
+
+ if (entry->subjoin == NULL) {
+ for (topjoin = iter->cjoin; topjoin->parent != NULL;
+ topjoin = topjoin->parent)
+ ;
+ to_dup = entry->ends[0].cursor;
+
+ if (F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW))
+ config = &raw_cfg[0];
+ else
+ config = &def_cfg[0];
+
+ size = strlen(to_dup->internal_uri) + 3;
+ WT_ERR(__wt_calloc(session, size, 1, &uri));
+ snprintf(uri, size, "%s()", to_dup->internal_uri);
+ if ((c = iter->cursor) == NULL || !WT_STREQ(c->uri, uri)) {
+ iter->cursor = NULL;
+ if (c != NULL)
+ WT_ERR(c->close(c));
+ WT_ERR(__wt_open_cursor(session, uri,
+ (WT_CURSOR *)topjoin, config, &iter->cursor));
}
- idxkey->data = NULL;
- idxkey->size = 0;
+ WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
+ } else if (iter->cursor != NULL) {
+ WT_ERR(iter->cursor->close(iter->cursor));
+ iter->cursor = NULL;
+ }
+
+err: __wt_free(session, uri);
+ return (ret);
+}
+
+/*
+ * __curjoin_iter_bump --
+ * Called to advance the iterator to the next endpoint, which may in turn
+ * advance to the next entry.
+ *
+ * The iterator is always left unpositioned. For a non-subjoin entry
+ * being iterated as an equality, the next endpoint (if any remain) is
+ * selected by duplicating that endpoint cursor's position into the
+ * iterator's cursor. Otherwise the endpoint state is zeroed, any
+ * iterator still attached to the entry's subjoin is closed, and the
+ * iterator moves to the following entry. When entry_pos reaches
+ * entry_count the entry pointer is cleared and zero is returned; that is
+ * the state WT_CURJOIN_ITER_CONSUMED tests for. A following entry that
+ * is a subjoin gets a child iterator; any other entry is installed with
+ * __curjoin_iter_set_entry.
+ */
+static int
+__curjoin_iter_bump(WT_CURSOR_JOIN_ITER *iter)
+{
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_SESSION_IMPL *session;
+
+ session = iter->session;
+ iter->positioned = false;
+ entry = iter->entry;
+ if (entry->subjoin == NULL && iter->is_equal &&
+ ++iter->end_pos < iter->end_count) {
+ WT_RET(__wt_cursor_dup_position(
+ entry->ends[iter->end_pos].cursor, iter->cursor));
+ return (0);
}
+ iter->end_pos = iter->end_count = iter->end_skip = 0;
+ if (entry->subjoin != NULL && entry->subjoin->iter != NULL)
+ WT_RET(__curjoin_iter_close_all(entry->subjoin->iter));
+
+ if (++iter->entry_pos >= iter->entry_count) {
+ iter->entry = NULL;
+ return (0);
+ }
+ iter->entry = ++entry;
+ if (entry->subjoin != NULL) {
+ WT_RET(__curjoin_iter_init(session, entry->subjoin,
+ &iter->child));
+ return (0);
+ }
+ WT_RET(__curjoin_iter_set_entry(iter, iter->entry_pos));
return (0);
}
/*
- * __curjoin_entry_iter_next --
+ * __curjoin_iter_next --
* Get the next item in an iteration.
*
*/
static int
-__curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor)
+__curjoin_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor)
{
- if (iter->positioned)
- WT_RET(iter->cursor->next(iter->cursor));
- else
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+
+ session = iter->session;
+
+ if (WT_CURJOIN_ITER_CONSUMED(iter))
+ return (WT_NOTFOUND);
+again:
+ entry = iter->entry;
+ if (entry->subjoin != NULL) {
+ if (iter->child == NULL)
+ WT_RET(__curjoin_iter_init(session,
+ entry->subjoin, &iter->child));
+ ret = __curjoin_iter_next(iter->child, cursor);
+ if (ret == 0) {
+ /* The child did the work, we're done. */
+ iter->curkey = &cursor->key;
+ iter->positioned = true;
+ return (ret);
+ }
+ else if (ret == WT_NOTFOUND) {
+ WT_RET(__curjoin_iter_close_all(iter->child));
+ entry->subjoin->iter = NULL;
+ iter->child = NULL;
+ WT_RET(__curjoin_iter_bump(iter));
+ ret = 0;
+ }
+ } else if (iter->positioned) {
+ ret = iter->cursor->next(iter->cursor);
+ if (ret == WT_NOTFOUND) {
+ WT_RET(__curjoin_iter_bump(iter));
+ ret = 0;
+ } else
+ WT_RET(ret);
+ } else
iter->positioned = true;
+ if (WT_CURJOIN_ITER_CONSUMED(iter))
+ return (WT_NOTFOUND);
+
+ if (!__curjoin_iter_ready(iter))
+ goto again;
+
+ WT_RET(ret);
+
/*
* Set our key to the primary key, we'll also need this
* to check membership.
@@ -175,51 +315,380 @@ __curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_CURSOR *cursor)
}
/*
- * __curjoin_entry_iter_reset --
- * Reset an iteration to the starting point.
- *
+ * __curjoin_close --
+ * WT_CURSOR::close for join cursors.
*/
static int
-__curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter)
+__curjoin_close(WT_CURSOR *cursor)
{
- if (iter->positioned) {
- WT_RET(iter->cursor->reset(iter->cursor));
- WT_RET(iter->main->reset(iter->main));
- WT_RET(__wt_cursor_dup_position(
- iter->cjoin->entries[0].ends[0].cursor, iter->cursor));
- iter->positioned = false;
- iter->entry->stats.actual_count = 0;
+ WT_CURSOR_JOIN *cjoin;
+ WT_CURSOR_JOIN_ENDPOINT *end;
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ u_int i;
+
+ cjoin = (WT_CURSOR_JOIN *)cursor;
+
+ JOINABLE_CURSOR_API_CALL(cursor, session, close, NULL);
+
+ __wt_schema_release_table(session, cjoin->table);
+ /* These are owned by the table */
+ cursor->internal_uri = NULL;
+ cursor->key_format = NULL;
+ if (cjoin->projection != NULL) {
+ __wt_free(session, cjoin->projection);
+ __wt_free(session, cursor->value_format);
+ }
+
+ for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
+ entry++, i++) {
+ if (entry->subjoin != NULL) {
+ F_CLR(&entry->subjoin->iface, WT_CURSTD_JOINED);
+ entry->subjoin->parent = NULL;
+ }
+ if (entry->main != NULL)
+ WT_TRET(entry->main->close(entry->main));
+ if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
+ WT_TRET(__wt_bloom_close(entry->bloom));
+ for (end = &entry->ends[0];
+ end < &entry->ends[entry->ends_next]; end++) {
+ F_CLR(end->cursor, WT_CURSTD_JOINED);
+ if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR))
+ WT_TRET(end->cursor->close(end->cursor));
+ }
+ __wt_free(session, entry->ends);
+ __wt_free(session, entry->repack_format);
+ }
+
+ if (cjoin->iter != NULL)
+ WT_TRET(__curjoin_iter_close_all(cjoin->iter));
+ if (cjoin->main != NULL)
+ WT_TRET(cjoin->main->close(cjoin->main));
+
+ __wt_free(session, cjoin->entries);
+ WT_TRET(__wt_cursor_close(cursor));
+
+err: API_END_RET(session, ret);
+}
+
+/*
+ * __curjoin_endpoint_init_key --
+ * Set the key in the reference endpoint.
+ *
+ * Nothing is done when the endpoint has no cursor. For an entry with an
+ * index, the index cursor's child key is repacked into endpoint->key,
+ * using the entry's repack_format when one is set and the index cursor's
+ * own key_format otherwise. For an entry without an index, the key of
+ * the table cursor's first column-group cursor is used: a record-number
+ * key is read as a uint64_t and packed into the endpoint's recno_buf,
+ * any other key is copied as a WT_ITEM structure (the assignment copies
+ * the item, not the bytes it points at).
+ */
+static int
+__curjoin_endpoint_init_key(WT_SESSION_IMPL *session,
+ WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ENDPOINT *endpoint)
+{
+ WT_CURSOR *cursor;
+ WT_CURSOR_INDEX *cindex;
+ WT_ITEM *k;
+ uint64_t r;
+
+ if ((cursor = endpoint->cursor) != NULL) {
+ if (entry->index != NULL) {
+ /* Extract and save the index's logical key. */
+ cindex = (WT_CURSOR_INDEX *)endpoint->cursor;
+ WT_RET(__wt_struct_repack(session,
+ cindex->child->key_format,
+ (entry->repack_format != NULL ?
+ entry->repack_format : cindex->iface.key_format),
+ &cindex->child->key, &endpoint->key));
+ } else {
+ k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key;
+ if (WT_CURSOR_RECNO(cursor)) {
+ r = *(uint64_t *)k->data;
+ WT_RET(__curjoin_pack_recno(session, r,
+ endpoint->recno_buf,
+ sizeof(endpoint->recno_buf),
+ &endpoint->key));
+ }
+ else
+ endpoint->key = *k;
+ }
+ }
return (0);
}
/*
- * __curjoin_entry_iter_ready --
- * The iterator is positioned.
- *
+ * __curjoin_entries_in_range --
+ * Check if a key is in the range specified by the remaining entries,
+ * returning WT_NOTFOUND if not.
*/
-static bool
-__curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter)
+static int
+__curjoin_entries_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iterarg)
{
- return (iter->positioned);
+ WT_CURSOR_JOIN_ENTRY *entry;
+ WT_CURSOR_JOIN_ITER *iter;
+ WT_DECL_RET;
+ int fastret, slowret;
+ u_int pos;
+
+ iter = iterarg;
+ if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
+ fastret = 0;
+ slowret = WT_NOTFOUND;
+ } else {
+ fastret = WT_NOTFOUND;
+ slowret = 0;
+ }
+ pos = iter == NULL ? 0 : iter->entry_pos;
+ for (entry = &cjoin->entries[pos]; pos < cjoin->entries_next;
+ entry++, pos++) {
+ ret = __curjoin_entry_member(session, entry, curkey, iter);
+ if (ret == fastret)
+ return (fastret);
+ if (ret != slowret)
+ break;
+ iter = NULL;
+ }
+
+ return (ret == 0 ? slowret : ret);
}
/*
- * __curjoin_entry_iter_close --
- * Close the iteration, release resources.
- *
+ * __curjoin_entry_in_range --
+ * Check if a key is in the range specified by the entry, returning
+ * WT_NOTFOUND if not.
*/
static int
-__curjoin_entry_iter_close(WT_CURSOR_JOIN_ITER *iter)
+__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
+ WT_ITEM *curkey, WT_CURSOR_JOIN_ITER *iter)
{
+ WT_COLLATOR *collator;
+ WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
+ bool disjunction, passed;
+ int cmp;
+ u_int pos;
+
+ collator = (entry->index != NULL) ? entry->index->collator : NULL;
+ endmax = &entry->ends[entry->ends_next];
+ disjunction = F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION);
+ passed = false;
+
+ /*
+ * The iterator may have already satisfied some endpoint conditions.
+ * If so and we're a disjunction, we're done. If so and we're a
+ * conjunction, we can start past the satisfied conditions.
+ */
+ if (iter == NULL)
+ pos = 0;
+ else {
+ if (disjunction && iter->end_skip)
+ return (0);
+ pos = iter->end_pos + iter->end_skip;
+ }
+
+ for (end = &entry->ends[pos]; end < endmax; end++) {
+ WT_RET(__wt_compare(session, collator, curkey, &end->key,
+ &cmp));
+ switch (WT_CURJOIN_END_RANGE(end)) {
+ case WT_CURJOIN_END_EQ:
+ passed = (cmp == 0);
+ break;
+
+ case WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ:
+ passed = (cmp >= 0);
+ WT_ASSERT(session, iter == NULL);
+ break;
+
+ case WT_CURJOIN_END_GT:
+ passed = (cmp > 0);
+ if (passed && iter != NULL && pos == 0)
+ iter->end_skip = 1;
+ break;
+
+ case WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ:
+ passed = (cmp <= 0);
+ break;
+
+ case WT_CURJOIN_END_LT:
+ passed = (cmp < 0);
+ break;
+
+ default:
+ WT_RET(__wt_illegal_value(session, NULL));
+ break;
+ }
+
+ if (!passed) {
+ if (iter != NULL &&
+ (iter->is_equal ||
+ F_ISSET(end, WT_CURJOIN_END_LT))) {
+ WT_RET(__curjoin_iter_bump(iter));
+ return (WT_NOTFOUND);
+ }
+ if (!disjunction)
+ return (WT_NOTFOUND);
+ iter = NULL;
+ } else if (disjunction)
+ break;
+ }
+ if (disjunction && end == endmax)
+ return (WT_NOTFOUND);
+ else
+ return (0);
+}
+
+typedef struct {
+ WT_CURSOR iface;
+ WT_CURSOR_JOIN_ENTRY *entry;
+ bool ismember;
+} WT_CURJOIN_EXTRACTOR;
+
+/*
+ * __curjoin_extract_insert --
+ * Handle a key produced by a custom extractor.
+ *
+ * The cursor is the WT_CURJOIN_EXTRACTOR set up for a membership check.
+ * Each call tests one extracted key against the entry's endpoints and
+ * records a match in the extractor's ismember flag. A key that is out
+ * of range is not an error: WT_NOTFOUND is turned into zero so that
+ * extraction can continue. Any other non-zero value is returned to the
+ * caller unchanged.
+ */
+static int
+__curjoin_extract_insert(WT_CURSOR *cursor) {
+ WT_CURJOIN_EXTRACTOR *cextract;
WT_DECL_RET;
+ WT_ITEM ikey;
+ WT_SESSION_IMPL *session;
- if (iter->cursor != NULL)
- WT_TRET(iter->cursor->close(iter->cursor));
- if (iter->main != NULL)
- WT_TRET(iter->main->close(iter->main));
- __wt_free(iter->session, iter);
+ cextract = (WT_CURJOIN_EXTRACTOR *)cursor;
+ /*
+ * This insert method may be called multiple times during a single
+ * extraction. If we already have a definitive answer to the
+ * membership question, exit early.
+ */
+ if (cextract->ismember)
+ return (0);
+
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_ITEM_SET(ikey, cursor->key);
+ /*
+ * We appended a padding byte to the key to avoid rewriting the last
+ * column. Strip that away here.
+ */
+ WT_ASSERT(session, ikey.size > 0);
+ --ikey.size;
+
+ /* No iterator is involved in an extractor check, pass a NULL one. */
+ ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, NULL);
+ if (ret == WT_NOTFOUND)
+ ret = 0;
+ else if (ret == 0)
+ cextract->ismember = true;
+
+ return (ret);
+}
+
+/*
+ * __curjoin_entry_member --
+ * Do a membership check for a particular index that was joined,
+ * if not a member, returns WT_NOTFOUND.
+ */
+static int
+__curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
+ WT_ITEM *key, WT_CURSOR_JOIN_ITER *iter)
+{
+ WT_CURJOIN_EXTRACTOR extract_cursor;
+ WT_CURSOR *c;
+ WT_CURSOR_STATIC_INIT(iface,
+ __wt_cursor_get_key, /* get-key */
+ __wt_cursor_get_value, /* get-value */
+ __wt_cursor_set_key, /* set-key */
+ __wt_cursor_set_value, /* set-value */
+ __wt_cursor_compare_notsup, /* compare */
+ __wt_cursor_equals_notsup, /* equals */
+ __wt_cursor_notsup, /* next */
+ __wt_cursor_notsup, /* prev */
+ __wt_cursor_notsup, /* reset */
+ __wt_cursor_notsup, /* search */
+ __wt_cursor_search_near_notsup, /* search-near */
+ __curjoin_extract_insert, /* insert */
+ __wt_cursor_notsup, /* update */
+ __wt_cursor_notsup, /* remove */
+ __wt_cursor_reconfigure_notsup, /* reconfigure */
+ __wt_cursor_notsup); /* close */
+ WT_DECL_RET;
+ WT_INDEX *idx;
+ WT_ITEM v;
+ bool bloom_found;
+
+ if (entry->subjoin == NULL && iter != NULL &&
+ (iter->end_pos + iter->end_skip >= entry->ends_next ||
+ (iter->end_skip > 0 &&
+ F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))))
+ return (0); /* no checks to make */
+
+ entry->stats.accesses++;
+ bloom_found = false;
+
+ if (entry->bloom != NULL) {
+ /*
+ * If we don't own the Bloom filter, we must be sharing one
+ * in a previous entry. So the shared filter has already
+ * been checked and passed.
+ */
+ if (!F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
+ return (0);
+
+ /*
+ * If the item is not in the Bloom filter, we return
+ * immediately, otherwise, we still need to check the
+ * long way.
+ */
+ WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
+ bloom_found = true;
+ }
+ if (entry->subjoin != NULL) {
+ WT_ASSERT(session,
+ iter == NULL || entry->subjoin == iter->child->cjoin);
+ ret = __curjoin_entries_in_range(session, entry->subjoin,
+ key, iter == NULL ? NULL : iter->child);
+ if (iter != NULL &&
+ WT_CURJOIN_ITER_CONSUMED(iter->child)) {
+ WT_ERR(__curjoin_iter_bump(iter));
+ ret = WT_NOTFOUND;
+ }
+ return (ret);
+ }
+ if (entry->index != NULL) {
+ /*
+ * If this entry is used by the iterator, then we already
+ * have the index key, and we won't have to do any
+ * extraction either.
+ */
+ if (iter != NULL && entry == iter->entry)
+ WT_ITEM_SET(v, iter->idxkey);
+ else {
+ memset(&v, 0, sizeof(v)); /* Keep lint quiet. */
+ c = entry->main;
+ c->set_key(c, key);
+ if ((ret = c->search(c)) == 0)
+ ret = c->get_value(c, &v);
+ else if (ret == WT_NOTFOUND)
+ WT_ERR_MSG(session, WT_ERROR,
+ "main table for join is missing entry");
+ WT_TRET(c->reset(c));
+ WT_ERR(ret);
+ }
+ } else
+ WT_ITEM_SET(v, *key);
+
+ if ((idx = entry->index) != NULL && idx->extractor != NULL &&
+ (iter == NULL || entry != iter->entry)) {
+ WT_CLEAR(extract_cursor);
+ extract_cursor.iface = iface;
+ extract_cursor.iface.session = &session->iface;
+ extract_cursor.iface.key_format = idx->exkey_format;
+ extract_cursor.ismember = false;
+ extract_cursor.entry = entry;
+ WT_ERR(idx->extractor->extract(idx->extractor,
+ &session->iface, key, &v, &extract_cursor.iface));
+ if (!extract_cursor.ismember)
+ WT_ERR(WT_NOTFOUND);
+ } else
+ WT_ERR(__curjoin_entry_in_range(session, entry, &v, iter));
+ if (0) {
+err: if (ret == WT_NOTFOUND && bloom_found)
+ entry->stats.bloom_false_positive++;
+ }
return (ret);
}
@@ -238,10 +707,10 @@ __curjoin_get_key(WT_CURSOR *cursor, ...)
cjoin = (WT_CURSOR_JOIN *)cursor;
va_start(ap, cursor);
- CURSOR_API_CALL(cursor, session, get_key, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, get_key, NULL);
if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) ||
- !__curjoin_entry_iter_ready(cjoin->iter))
+ !cjoin->iter->positioned)
WT_ERR_MSG(session, EINVAL,
"join cursor must be advanced with next()");
WT_ERR(__wt_cursor_get_keyv(cursor, cursor->flags, ap));
@@ -258,23 +727,21 @@ static int
__curjoin_get_value(WT_CURSOR *cursor, ...)
{
WT_CURSOR_JOIN *cjoin;
- WT_CURSOR_JOIN_ITER *iter;
WT_DECL_RET;
WT_SESSION_IMPL *session;
va_list ap;
cjoin = (WT_CURSOR_JOIN *)cursor;
- iter = cjoin->iter;
va_start(ap, cursor);
- CURSOR_API_CALL(cursor, session, get_value, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, get_value, NULL);
if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) ||
- !__curjoin_entry_iter_ready(iter))
+ !cjoin->iter->positioned)
WT_ERR_MSG(session, EINVAL,
"join cursor must be advanced with next()");
- WT_ERR(__wt_curtable_get_valuev(iter->main, ap));
+ WT_ERR(__wt_curtable_get_valuev(cjoin->main, ap));
err: va_end(ap);
API_END_RET(session, ret);
@@ -298,7 +765,8 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
session, WT_SESSION_open_cursor), "raw", NULL };
const char *uri;
size_t size;
- int cmp, skip;
+ u_int skip;
+ int cmp;
c = NULL;
skip = 0;
@@ -354,7 +822,34 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
for (end = &entry->ends[skip]; end < endmax; end++) {
WT_ERR(__wt_compare(session, collator, &curkey,
&end->key, &cmp));
- if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
+ if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
+ /* if condition satisfied, insert immediately */
+ switch (WT_CURJOIN_END_RANGE(end)) {
+ case WT_CURJOIN_END_EQ:
+ if (cmp == 0)
+ goto insert;
+ break;
+ case WT_CURJOIN_END_GT:
+ if (cmp > 0) {
+ /* skip this check next time */
+ skip = entry->ends_next;
+ goto insert;
+ }
+ break;
+ case WT_CURJOIN_END_GE:
+ if (cmp >= 0)
+ goto insert;
+ break;
+ case WT_CURJOIN_END_LT:
+ if (cmp < 0)
+ goto insert;
+ break;
+ case WT_CURJOIN_END_LE:
+ if (cmp <= 0)
+ goto insert;
+ break;
+ }
+ } else if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
if (cmp < 0 || (cmp == 0 &&
!F_ISSET(end, WT_CURJOIN_END_EQ)))
goto advance;
@@ -370,6 +865,14 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
goto done;
}
}
+ /*
+ * Either it's a disjunction that hasn't satisfied any
+ * condition, or it's a conjunction that has satisfied all
+ * conditions.
+ */
+ if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
+ goto advance;
+insert:
if (entry->index != NULL) {
curvalue.data =
(unsigned char *)curkey.data + curkey.size;
@@ -394,107 +897,86 @@ err: if (c != NULL)
}
/*
- * __curjoin_endpoint_init_key --
- * Set the key in the reference endpoint.
- */
-static int
-__curjoin_endpoint_init_key(WT_SESSION_IMPL *session,
- WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ENDPOINT *endpoint)
-{
- WT_CURSOR *cursor;
- WT_CURSOR_INDEX *cindex;
- WT_ITEM *k;
- uint64_t r;
-
- if ((cursor = endpoint->cursor) != NULL) {
- if (entry->index != NULL) {
- /* Extract and save the index's logical key. */
- cindex = (WT_CURSOR_INDEX *)endpoint->cursor;
- WT_RET(__wt_struct_repack(session,
- cindex->child->key_format,
- (entry->repack_format != NULL ?
- entry->repack_format : cindex->iface.key_format),
- &cindex->child->key, &endpoint->key));
- } else {
- k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key;
- if (WT_CURSOR_RECNO(cursor)) {
- r = *(uint64_t *)k->data;
- WT_RET(__curjoin_pack_recno(session, r,
- endpoint->recno_buf,
- sizeof(endpoint->recno_buf),
- &endpoint->key));
- }
- else
- endpoint->key = *k;
- }
- }
- return (0);
-}
-
-/*
- * __curjoin_init_iter --
- * Initialize before any iteration.
+ * __curjoin_init_next --
+ * Initialize the cursor join when the next function is first called.
*/
static int
-__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
+__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ bool iterable)
{
WT_BLOOM *bloom;
WT_DECL_RET;
WT_CURSOR *origcur;
WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
WT_CURSOR_JOIN_ENDPOINT *end;
+ char *mainbuf;
const char *def_cfg[] = { WT_CONFIG_BASE(
session, WT_SESSION_open_cursor), NULL };
const char *raw_cfg[] = { WT_CONFIG_BASE(
session, WT_SESSION_open_cursor), "raw", NULL };
+ const char **config, *proj, *urimain;
+ size_t size;
uint32_t f, k;
+ mainbuf = NULL;
if (cjoin->entries_next == 0)
WT_RET_MSG(session, EINVAL,
"join cursor has not yet been joined with any other "
"cursors");
- je = &cjoin->entries[0];
- jeend = &cjoin->entries[cjoin->entries_next];
-
- /*
- * For a single compare=le endpoint in the first iterated entry,
- * construct a companion compare=ge endpoint that will actually
- * be iterated.
- */
- if (((je = cjoin->entries) != jeend) &&
- je->ends_next == 1 && F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
- origcur = je->ends[0].cursor;
- WT_RET(__curjoin_insert_endpoint(session, je, 0, &end));
- WT_RET(__wt_open_cursor(session, origcur->uri,
- (WT_CURSOR *)cjoin,
- F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
- &end->cursor));
- WT_RET(end->cursor->next(end->cursor));
- end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
- WT_CURJOIN_END_OWN_CURSOR;
+ if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
+ config = &raw_cfg[0];
+ else
+ config = &def_cfg[0];
+ urimain = cjoin->table->name;
+ if ((proj = cjoin->projection) != NULL) {
+ size = strlen(urimain) + strlen(proj) + 1;
+ WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
+ snprintf(mainbuf, size, "%s%s", urimain, proj);
+ urimain = mainbuf;
}
- WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));
+ WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
+ &cjoin->main));
+ jeend = &cjoin->entries[cjoin->entries_next];
for (je = cjoin->entries; je < jeend; je++) {
+ if (je->subjoin != NULL) {
+ WT_ERR(__curjoin_init_next(session, je->subjoin,
+ iterable));
+ continue;
+ }
__wt_stat_join_init_single(&je->stats);
+ /*
+ * For a single compare=le/lt endpoint in any entry that may
+ * be iterated, construct a companion compare=ge endpoint
+ * that will actually be iterated.
+ */
+ if (iterable && je->ends_next == 1 &&
+ F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
+ origcur = je->ends[0].cursor;
+ WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
+ WT_ERR(__wt_open_cursor(session, origcur->uri,
+ (WT_CURSOR *)cjoin,
+ F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
+ &end->cursor));
+ end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
+ WT_CURJOIN_END_OWN_CURSOR;
+ WT_ERR(end->cursor->next(end->cursor));
+ F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
+ }
for (end = &je->ends[0]; end < &je->ends[je->ends_next];
end++)
- WT_RET(__curjoin_endpoint_init_key(session, je, end));
+ WT_ERR(__curjoin_endpoint_init_key(session, je, end));
/*
- * The first entry is iterated as the 'outermost' cursor.
- * For the common GE case, we don't have to test against
- * the left reference key, we know it will be true since
- * the btree is ordered.
+ * Do any needed Bloom filter initialization. Ignore Bloom
+ * filters for entries that will be iterated. They won't
+ * help since these entries either don't need an inclusion
+ * check or are doing any needed check during the iteration.
*/
- if (je == cjoin->entries && je->ends[0].flags ==
- (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
- F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
-
- if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
+ if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
- WT_RET_MSG(session, EINVAL,
+ WT_ERR_MSG(session, EINVAL,
"join cursors with Bloom filters cannot be "
"used with read-uncommitted isolation");
if (je->bloom == NULL) {
@@ -516,10 +998,10 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
}
je->bloom_bit_count = f;
je->bloom_hash_count = k;
- WT_RET(__wt_bloom_create(session, NULL,
+ WT_ERR(__wt_bloom_create(session, NULL,
NULL, je->count, f, k, &je->bloom));
F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
- WT_RET(__curjoin_init_bloom(session, cjoin,
+ WT_ERR(__curjoin_init_bloom(session, cjoin,
je, je->bloom));
/*
* Share the Bloom filter, making all
@@ -541,201 +1023,45 @@ __curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
* merge into the shared one. The Bloom
* parameters of the two filters must match.
*/
- WT_RET(__wt_bloom_create(session, NULL,
+ WT_ERR(__wt_bloom_create(session, NULL,
NULL, je->count, je->bloom_bit_count,
je->bloom_hash_count, &bloom));
- WT_RET(__curjoin_init_bloom(session, cjoin,
+ WT_ERR(__curjoin_init_bloom(session, cjoin,
je, bloom));
- WT_RET(__wt_bloom_intersection(je->bloom,
+ WT_ERR(__wt_bloom_intersection(je->bloom,
bloom));
- WT_RET(__wt_bloom_close(bloom));
+ WT_ERR(__wt_bloom_close(bloom));
}
}
+ if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
+ iterable = false;
}
-
F_SET(cjoin, WT_CURJOIN_INITIALIZED);
- return (ret);
-}
-
-/*
- * __curjoin_entry_in_range --
- * Check if a key is in the range specified by the entry, returning
- * WT_NOTFOUND if not.
- */
-static int
-__curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
- WT_ITEM *curkey, bool skip_left)
-{
- WT_COLLATOR *collator;
- WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
- int cmp;
-
- collator = (entry->index != NULL) ? entry->index->collator : NULL;
- endmax = &entry->ends[entry->ends_next];
- for (end = &entry->ends[skip_left ? 1 : 0]; end < endmax; end++) {
- WT_RET(__wt_compare(session, collator, curkey, &end->key,
- &cmp));
- if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
- if (cmp < 0 ||
- (cmp == 0 &&
- !F_ISSET(end, WT_CURJOIN_END_EQ)) ||
- (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT)))
- WT_RET(WT_NOTFOUND);
- } else {
- if (cmp > 0 ||
- (cmp == 0 &&
- !F_ISSET(end, WT_CURJOIN_END_EQ)) ||
- (cmp < 0 && !F_ISSET(end, WT_CURJOIN_END_LT)))
- WT_RET(WT_NOTFOUND);
- }
- }
- return (0);
-}
-
-typedef struct {
- WT_CURSOR iface;
- WT_CURSOR_JOIN_ENTRY *entry;
- bool ismember;
-} WT_CURJOIN_EXTRACTOR;
-
-/*
- * __curjoin_extract_insert --
- * Handle a key produced by a custom extractor.
- */
-static int
-__curjoin_extract_insert(WT_CURSOR *cursor) {
- WT_CURJOIN_EXTRACTOR *cextract;
- WT_DECL_RET;
- WT_ITEM ikey;
- WT_SESSION_IMPL *session;
-
- cextract = (WT_CURJOIN_EXTRACTOR *)cursor;
- /*
- * This insert method may be called multiple times during a single
- * extraction. If we already have a definitive answer to the
- * membership question, exit early.
- */
- if (cextract->ismember)
- return (0);
-
- session = (WT_SESSION_IMPL *)cursor->session;
-
- WT_ITEM_SET(ikey, cursor->key);
- /*
- * We appended a padding byte to the key to avoid rewriting the last
- * column. Strip that away here.
- */
- WT_ASSERT(session, ikey.size > 0);
- --ikey.size;
-
- ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false);
- if (ret == WT_NOTFOUND)
- ret = 0;
- else if (ret == 0)
- cextract->ismember = true;
+err: __wt_free(session, mainbuf);
return (ret);
}
/*
- * __curjoin_entry_member --
- * Do a membership check for a particular index that was joined,
- * if not a member, returns WT_NOTFOUND.
+ * __curjoin_insert_endpoint --
+ * Insert a new entry into the endpoint array for the join entry.
*/
static int
-__curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
- WT_CURSOR_JOIN_ENTRY *entry, bool skip_left)
+__curjoin_insert_endpoint(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
+ u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp)
{
- WT_CURJOIN_EXTRACTOR extract_cursor;
- WT_CURSOR *c;
- WT_CURSOR_STATIC_INIT(iface,
- __wt_cursor_get_key, /* get-key */
- __wt_cursor_get_value, /* get-value */
- __wt_cursor_set_key, /* set-key */
- __wt_cursor_set_value, /* set-value */
- __wt_cursor_compare_notsup, /* compare */
- __wt_cursor_equals_notsup, /* equals */
- __wt_cursor_notsup, /* next */
- __wt_cursor_notsup, /* prev */
- __wt_cursor_notsup, /* reset */
- __wt_cursor_notsup, /* search */
- __wt_cursor_search_near_notsup, /* search-near */
- __curjoin_extract_insert, /* insert */
- __wt_cursor_notsup, /* update */
- __wt_cursor_notsup, /* remove */
- __wt_cursor_reconfigure_notsup, /* reconfigure */
- __wt_cursor_notsup); /* close */
- WT_DECL_RET;
- WT_INDEX *idx;
- WT_ITEM *key, v;
- bool bloom_found;
-
- if (skip_left && entry->ends_next == 1)
- return (0); /* no checks to make */
- key = cjoin->iter->curkey;
- entry->stats.accesses++;
- bloom_found = false;
-
- if (entry->bloom != NULL) {
- /*
- * If we don't own the Bloom filter, we must be sharing one
- * in a previous entry. So the shared filter has already
- * been checked and passed.
- */
- if (!F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
- return (0);
-
- /*
- * If the item is not in the Bloom filter, we return
- * immediately, otherwise, we still need to check the
- * long way.
- */
- WT_ERR(__wt_bloom_inmem_get(entry->bloom, key));
- bloom_found = true;
- }
- if (entry->index != NULL) {
- /*
- * If this entry is used by the iterator, then we already
- * have the index key, and we won't have to do any extraction
- * either.
- */
- if (entry == cjoin->iter->entry)
- WT_ITEM_SET(v, cjoin->iter->idxkey);
- else {
- memset(&v, 0, sizeof(v)); /* Keep lint quiet. */
- c = entry->main;
- c->set_key(c, key);
- if ((ret = c->search(c)) == 0)
- ret = c->get_value(c, &v);
- else if (ret == WT_NOTFOUND)
- WT_ERR_MSG(session, WT_ERROR,
- "main table for join is missing entry");
- WT_TRET(c->reset(c));
- WT_ERR(ret);
- }
- } else
- WT_ITEM_SET(v, *key);
+ WT_CURSOR_JOIN_ENDPOINT *newend;
- if ((idx = entry->index) != NULL && idx->extractor != NULL &&
- entry != cjoin->iter->entry) {
- WT_CLEAR(extract_cursor);
- extract_cursor.iface = iface;
- extract_cursor.iface.session = &session->iface;
- extract_cursor.iface.key_format = idx->exkey_format;
- extract_cursor.ismember = false;
- extract_cursor.entry = entry;
- WT_ERR(idx->extractor->extract(idx->extractor,
- &session->iface, key, &v, &extract_cursor.iface));
- if (!extract_cursor.ismember)
- WT_ERR(WT_NOTFOUND);
- } else
- WT_ERR(__curjoin_entry_in_range(session, entry, &v, skip_left));
+ WT_RET(__wt_realloc_def(session, &entry->ends_allocated,
+ entry->ends_next + 1, &entry->ends));
+ newend = &entry->ends[pos];
+ memmove(newend + 1, newend,
+ (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
+ memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
+ entry->ends_next++;
+ *newendp = newend;
- if (0) {
-err: if (ret == WT_NOTFOUND && bloom_found)
- entry->stats.bloom_false_positive++;
- }
- return (ret);
+ return (0);
}
/*
@@ -750,61 +1076,52 @@ __curjoin_next(WT_CURSOR *cursor)
WT_CURSOR_JOIN_ITER *iter;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- bool skip_left;
- u_int i;
+ int tret;
cjoin = (WT_CURSOR_JOIN *)cursor;
- CURSOR_API_CALL(cursor, session, next, NULL);
+ JOINABLE_CURSOR_API_CALL(cursor, session, next, NULL);
if (F_ISSET(cjoin, WT_CURJOIN_ERROR))
WT_ERR_MSG(session, WT_ERROR,
"join cursor encountered previous error");
if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
- WT_ERR(__curjoin_init_iter(session, cjoin));
-
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+ WT_ERR(__curjoin_init_next(session, cjoin, true));
+ if (cjoin->iter == NULL)
+ WT_ERR(__curjoin_iter_init(session, cjoin, &cjoin->iter));
iter = cjoin->iter;
+ F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-nextkey:
- if ((ret = __curjoin_entry_iter_next(iter, cursor)) == 0) {
- F_SET(cursor, WT_CURSTD_KEY_EXT);
+ while ((ret = __curjoin_iter_next(iter, cursor)) == 0) {
+ if ((ret = __curjoin_entries_in_range(session, cjoin,
+ iter->curkey, iter)) != WT_NOTFOUND)
+ break;
+ }
+ iter->positioned = (ret == 0);
+ if (ret != 0 && ret != WT_NOTFOUND)
+ WT_ERR(ret);
+ if (ret == 0) {
/*
- * We may have already established membership for the
- * 'left' case for the first entry, since we're
- * using that in our iteration.
+ * Position the 'main' cursor, this will be used to retrieve
+ * values from the cursor join. The key we have is raw, but
+ * the main cursor may not be raw.
*/
- skip_left = F_ISSET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
- for (i = 0; i < cjoin->entries_next; i++) {
- ret = __curjoin_entry_member(session, cjoin,
- &cjoin->entries[i], skip_left);
- if (ret == WT_NOTFOUND) {
- /*
- * If this is compare=eq on our outer iterator,
- * and we've moved past it, we're done.
- */
- if (iter->isequal && i == 0)
- break;
- goto nextkey;
- }
- skip_left = false;
- WT_ERR(ret);
- }
- } else if (ret != WT_NOTFOUND)
- WT_ERR(ret);
+ c = cjoin->main;
+ __wt_cursor_set_raw_key(c, iter->curkey);
- if (ret == 0) {
/*
- * Position the 'main' cursor, this will be used to
- * retrieve values from the cursor join.
+ * A failed search is not expected, convert WT_NOTFOUND into a
+ * generic error.
*/
- c = iter->main;
- c->set_key(c, iter->curkey);
- if ((ret = c->search(c)) != 0)
- WT_ERR(c->search(c));
+ if ((ret = c->search(c)) == WT_NOTFOUND)
+ ret = WT_ERROR;
+ WT_ERR(ret);
+
F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- }
+ } else if (ret == WT_NOTFOUND &&
+ (tret = __curjoin_iter_close_all(iter)) != 0)
+ WT_ERR(tret);
if (0) {
err: F_SET(cjoin, WT_CURJOIN_ERROR);
@@ -813,78 +1130,148 @@ err: F_SET(cjoin, WT_CURJOIN_ERROR);
}
/*
- * __curjoin_reset --
- * WT_CURSOR::reset for join cursors.
+ * __curjoin_open_main --
+ * For the given index, open the main file with a projection
+ * that is the index keys.
*/
static int
-__curjoin_reset(WT_CURSOR *cursor)
+__curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_CURSOR_JOIN_ENTRY *entry)
{
- WT_CURSOR_JOIN *cjoin;
WT_DECL_RET;
- WT_SESSION_IMPL *session;
+ WT_INDEX *idx;
+ char *main_uri, *newformat;
+ const char *raw_cfg[] = { WT_CONFIG_BASE(
+ session, WT_SESSION_open_cursor), "raw", NULL };
+ size_t len, newsize;
- cjoin = (WT_CURSOR_JOIN *)cursor;
+ main_uri = NULL;
+ idx = entry->index;
+
+ newsize = strlen(cjoin->table->name) + idx->colconf.len + 1;
+ WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
+ snprintf(main_uri, newsize, "%s%.*s",
+ cjoin->table->name, (int)idx->colconf.len,
+ idx->colconf.str);
+ WT_ERR(__wt_open_cursor(session, main_uri,
+ (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
+ if (idx->extractor == NULL) {
+ /*
+ * Add no-op padding so trailing 'u' formats are not
+ * transformed to 'U'. This matches what happens in
+ * the index. We don't do this when we have an
+ * extractor, extractors already use the padding
+ * byte trick.
+ */
+ len = strlen(entry->main->value_format) + 3;
+ WT_ERR(__wt_calloc(session, len, 1, &newformat));
+ snprintf(newformat, len, "%s0x",
+ entry->main->value_format);
+ __wt_free(session, entry->main->value_format);
+ entry->main->value_format = newformat;
+ }
- CURSOR_API_CALL(cursor, session, reset, NULL);
+err: __wt_free(session, main_uri);
+ return (ret);
+}
- if (F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
- WT_ERR(__curjoin_entry_iter_reset(cjoin->iter));
+/*
+ * __curjoin_pack_recno --
+ * Pack the given recno into a buffer; prepare an item referencing it.
+ *
+ */
+static int
+__curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf,
+ size_t bufsize, WT_ITEM *item)
+{
+ WT_SESSION *wtsession;
+ size_t sz;
-err: API_END_RET(session, ret);
+ wtsession = (WT_SESSION *)session;
+ WT_RET(wiredtiger_struct_size(wtsession, &sz, "r", r));
+ WT_ASSERT(session, sz < bufsize);
+ WT_RET(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r));
+ item->size = sz;
+ item->data = buf;
+ return (0);
}
/*
- * __curjoin_close --
- * WT_CURSOR::close for join cursors.
+ * __curjoin_reset --
+ * WT_CURSOR::reset for join cursors.
*/
static int
-__curjoin_close(WT_CURSOR *cursor)
+__curjoin_reset(WT_CURSOR *cursor)
{
WT_CURSOR_JOIN *cjoin;
- WT_CURSOR_JOIN_ENDPOINT *end;
- WT_CURSOR_JOIN_ENTRY *entry;
WT_DECL_RET;
WT_SESSION_IMPL *session;
- u_int i;
cjoin = (WT_CURSOR_JOIN *)cursor;
- CURSOR_API_CALL(cursor, session, close, NULL);
-
- __wt_schema_release_table(session, cjoin->table);
- /* These are owned by the table */
- cursor->internal_uri = NULL;
- cursor->key_format = NULL;
- if (cjoin->projection != NULL) {
- __wt_free(session, cjoin->projection);
- __wt_free(session, cursor->value_format);
- }
-
- for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
- entry++, i++) {
- if (entry->main != NULL)
- WT_TRET(entry->main->close(entry->main));
- if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
- WT_TRET(__wt_bloom_close(entry->bloom));
- for (end = &entry->ends[0];
- end < &entry->ends[entry->ends_next]; end++) {
- F_CLR(end->cursor, WT_CURSTD_JOINED);
- if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR))
- WT_TRET(end->cursor->close(end->cursor));
- }
- __wt_free(session, entry->ends);
- __wt_free(session, entry->repack_format);
- }
+ JOINABLE_CURSOR_API_CALL(cursor, session, reset, NULL);
if (cjoin->iter != NULL)
- WT_TRET(__curjoin_entry_iter_close(cjoin->iter));
- __wt_free(session, cjoin->entries);
- WT_TRET(__wt_cursor_close(cursor));
+ WT_ERR(__curjoin_iter_reset(cjoin->iter));
err: API_END_RET(session, ret);
}
/*
+ * __curjoin_split_key --
+ * Copy the primary key from a cursor (either main table or index)
+ * to another cursor. When copying from an index file, the index
+ * key is also returned.
+ *
+ */
+static int
+__curjoin_split_key(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
+ WT_ITEM *idxkey, WT_CURSOR *tocur, WT_CURSOR *fromcur,
+ const char *repack_fmt, bool isindex)
+{
+ WT_CURSOR *firstcg_cur;
+ WT_CURSOR_INDEX *cindex;
+ WT_ITEM *keyp;
+ const uint8_t *p;
+
+ if (isindex) {
+ cindex = ((WT_CURSOR_INDEX *)fromcur);
+ /*
+ * Repack tells us where the index key ends; advance past
+ * that to get where the raw primary key starts.
+ */
+ WT_RET(__wt_struct_repack(session, cindex->child->key_format,
+ repack_fmt != NULL ? repack_fmt : cindex->iface.key_format,
+ &cindex->child->key, idxkey));
+ WT_ASSERT(session, cindex->child->key.size > idxkey->size);
+ tocur->key.data = (uint8_t *)idxkey->data + idxkey->size;
+ tocur->key.size = cindex->child->key.size - idxkey->size;
+ if (WT_CURSOR_RECNO(tocur)) {
+ p = (const uint8_t *)tocur->key.data;
+ WT_RET(__wt_vunpack_uint(&p, tocur->key.size,
+ &tocur->recno));
+ } else
+ tocur->recno = 0;
+ } else {
+ firstcg_cur = ((WT_CURSOR_TABLE *)fromcur)->cg_cursors[0];
+ keyp = &firstcg_cur->key;
+ if (WT_CURSOR_RECNO(tocur)) {
+ WT_ASSERT(session, keyp->size == sizeof(uint64_t));
+ tocur->recno = *(uint64_t *)keyp->data;
+ WT_RET(__curjoin_pack_recno(session, tocur->recno,
+ cjoin->recno_buf, sizeof(cjoin->recno_buf),
+ &tocur->key));
+ } else {
+ WT_ITEM_SET(tocur->key, *keyp);
+ tocur->recno = 0;
+ }
+ idxkey->data = NULL;
+ idxkey->size = 0;
+ }
+ return (0);
+}
+
+/*
* __wt_curjoin_open --
* Initialize a join cursor.
*
@@ -979,33 +1366,51 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
WT_CURSOR_INDEX *cindex;
WT_CURSOR_JOIN_ENDPOINT *end;
WT_CURSOR_JOIN_ENTRY *entry;
- WT_DECL_RET;
- bool hasins, needbloom, range_eq;
- char *main_uri, *newformat;
- const char *raw_cfg[] = { WT_CONFIG_BASE(
- session, WT_SESSION_open_cursor), "raw", NULL };
- size_t len, newsize;
+ WT_CURSOR_JOIN *child;
+ bool hasins, needbloom, nested, range_eq;
+ size_t len;
u_int i, ins, nonbloom;
+ uint8_t endrange;
entry = NULL;
hasins = needbloom = false;
- ins = 0; /* -Wuninitialized */
- main_uri = NULL;
- nonbloom = 0; /* -Wuninitialized */
+ ins = nonbloom = 0; /* -Wuninitialized */
- for (i = 0; i < cjoin->entries_next; i++) {
- if (cjoin->entries[i].index == idx) {
- entry = &cjoin->entries[i];
- break;
- }
- if (!needbloom && i > 0 &&
- !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
- needbloom = true;
- nonbloom = i;
+ if (cjoin->entries_next == 0) {
+ if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
+ F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
+ } else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
+ !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
+ WT_RET_MSG(session, EINVAL,
+ "operation=or does not match previous operation=and");
+ else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
+ F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
+ WT_RET_MSG(session, EINVAL,
+ "operation=and does not match previous operation=or");
+
+ nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
+ if (!nested)
+ for (i = 0; i < cjoin->entries_next; i++) {
+ if (cjoin->entries[i].index == idx &&
+ cjoin->entries[i].subjoin == NULL) {
+ entry = &cjoin->entries[i];
+ break;
+ }
+ if (!needbloom && i > 0 &&
+ !F_ISSET(&cjoin->entries[i],
+ WT_CURJOIN_ENTRY_BLOOM)) {
+ needbloom = true;
+ nonbloom = i;
+ }
}
+ else {
+ if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
+ WT_RET_MSG(session, EINVAL,
+ "Bloom filters cannot be used with subjoins");
}
+
if (entry == NULL) {
- WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
+ WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
cjoin->entries_next + 1, &cjoin->entries));
if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
/*
@@ -1034,13 +1439,13 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
} else {
/* Merge the join into an existing entry for this index */
if (count != 0 && entry->count != 0 && entry->count != count)
- WT_ERR_MSG(session, EINVAL,
+ WT_RET_MSG(session, EINVAL,
"count=%" PRIu64 " does not match "
"previous count=%" PRIu64 " for this index",
count, entry->count);
if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
- WT_ERR_MSG(session, EINVAL,
+ WT_RET_MSG(session, EINVAL,
"join has incompatible strategy "
"values for the same index");
@@ -1063,19 +1468,20 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
for (i = 0; i < entry->ends_next; i++) {
end = &entry->ends[i];
range_eq = (range == WT_CURJOIN_END_EQ);
+ endrange = WT_CURJOIN_END_RANGE(end);
if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
(F_ISSET(end, WT_CURJOIN_END_LT) &&
((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
- (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ &&
+ (endrange == WT_CURJOIN_END_EQ &&
(range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
!= 0))
- WT_ERR_MSG(session, EINVAL,
+ WT_RET_MSG(session, EINVAL,
"join has overlapping ranges");
if (range == WT_CURJOIN_END_EQ &&
- WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ &&
+ endrange == WT_CURJOIN_END_EQ &&
!F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
- WT_ERR_MSG(session, EINVAL,
+ WT_RET_MSG(session, EINVAL,
"compare=eq can only be combined "
"using operation=or");
@@ -1086,6 +1492,7 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
if (!hasins &&
((range & WT_CURJOIN_END_GT) != 0 ||
(range == WT_CURJOIN_END_EQ &&
+ endrange != WT_CURJOIN_END_EQ &&
!F_ISSET(end, WT_CURJOIN_END_GT)))) {
ins = i;
hasins = true;
@@ -1098,70 +1505,35 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
entry->bloom_hash_count =
WT_MAX(entry->bloom_hash_count, bloom_hash_count);
}
- WT_ERR(__curjoin_insert_endpoint(session, entry,
- hasins ? ins : entry->ends_next, &end));
- end->cursor = ref_cursor;
- F_SET(end, range);
-
- /* Open the main file with a projection of the indexed columns. */
- if (entry->main == NULL && idx != NULL) {
- newsize = strlen(cjoin->table->name) + idx->colconf.len + 1;
- WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
- snprintf(main_uri, newsize, "%s%.*s",
- cjoin->table->name, (int)idx->colconf.len,
- idx->colconf.str);
- WT_ERR(__wt_open_cursor(session, main_uri,
- (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
- if (idx->extractor == NULL) {
+ if (nested) {
+ child = (WT_CURSOR_JOIN *)ref_cursor;
+ entry->subjoin = child;
+ child->parent = cjoin;
+ } else {
+ WT_RET(__curjoin_insert_endpoint(session, entry,
+ hasins ? ins : entry->ends_next, &end));
+ end->cursor = ref_cursor;
+ F_SET(end, range);
+
+ if (entry->main == NULL && idx != NULL) {
/*
- * Add no-op padding so trailing 'u' formats are not
- * transformed to 'U'. This matches what happens in
- * the index. We don't do this when we have an
- * extractor, extractors already use the padding
- * byte trick.
+ * Open the main file with a projection of the
+ * indexed columns.
*/
- len = strlen(entry->main->value_format) + 3;
- WT_ERR(__wt_calloc(session, len, 1, &newformat));
- snprintf(newformat, len, "%s0x",
- entry->main->value_format);
- __wt_free(session, entry->main->value_format);
- entry->main->value_format = newformat;
- }
+ WT_RET(__curjoin_open_main(session, cjoin, entry));
- /*
- * When we are repacking index keys to remove the primary
- * key, we never want to transform trailing 'u'. Use no-op
- * padding to force this.
- */
- cindex = (WT_CURSOR_INDEX *)ref_cursor;
- len = strlen(cindex->iface.key_format) + 3;
- WT_ERR(__wt_calloc(session, len, 1, &entry->repack_format));
- snprintf(entry->repack_format, len, "%s0x",
- cindex->iface.key_format);
+ /*
+ * When we are repacking index keys to remove the
+ * primary key, we never want to transform trailing
+ * 'u'. Use no-op padding to force this.
+ */
+ cindex = (WT_CURSOR_INDEX *)ref_cursor;
+ len = strlen(cindex->iface.key_format) + 3;
+ WT_RET(__wt_calloc(session, len, 1,
+ &entry->repack_format));
+ snprintf(entry->repack_format, len, "%s0x",
+ cindex->iface.key_format);
+ }
}
-
-err: __wt_free(session, main_uri);
- return (ret);
-}
-
-/*
- * __curjoin_insert_endpoint --
- * Insert a new entry into the endpoint array for the join entry.
- */
-static int
-__curjoin_insert_endpoint(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry,
- u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp)
-{
- WT_CURSOR_JOIN_ENDPOINT *newend;
-
- WT_RET(__wt_realloc_def(session, &entry->ends_allocated,
- entry->ends_next + 1, &entry->ends));
- newend = &entry->ends[pos];
- memmove(newend + 1, newend,
- (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
- memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
- entry->ends_next++;
- *newendp = newend;
-
return (0);
}
diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c
index 7839971f975..7f220a3faa2 100644
--- a/src/cursor/cur_std.c
+++ b/src/cursor/cur_std.c
@@ -571,7 +571,6 @@ __wt_cursor_equals(WT_CURSOR *cursor, WT_CURSOR *other, int *equalp)
WT_SESSION_IMPL *session;
int cmp;
- session = (WT_SESSION_IMPL *)cursor->session;
CURSOR_API_CALL(cursor, session, equals, NULL);
WT_ERR(cursor->compare(cursor, other, &cmp));
diff --git a/src/docs/Doxyfile b/src/docs/Doxyfile
index 4c1682de6eb..69e9716b425 100644
--- a/src/docs/Doxyfile
+++ b/src/docs/Doxyfile
@@ -1570,6 +1570,8 @@ PREDEFINED = DOXYGEN \
__wt_event_handler:=WT_EVENT_HANDLER \
__wt_extension_api:=WT_EXTENSION_API \
__wt_extractor:=WT_EXTRACTOR \
+ __wt_file_handle:=WT_FILE_HANDLE \
+ __wt_file_system:=WT_FILE_SYSTEM \
__wt_item:=WT_ITEM \
__wt_lsn:=WT_LSN \
__wt_session:=WT_SESSION \
diff --git a/src/docs/backup.dox b/src/docs/backup.dox
index 7742e698ac8..45edc85d6a5 100644
--- a/src/docs/backup.dox
+++ b/src/docs/backup.dox
@@ -42,6 +42,12 @@ Copying the database files for a backup does not require any special
alignment or block size (specifically, Linux or Windows filesystems that
do not support read/write isolation can be safely read for backups).
+The database file may grow in size during the copy, and the file copy
+should not consider that an error. Blocks appended to the file after the
+copy starts can be safely ignored, that is, it is correct for the copy
+to determine an initial size of the file and then copy that many bytes,
+ignoring any bytes appended after the backup cursor was opened.
+
The cursor must not be closed until all of the files have been copied,
however, there is no requirement the files be copied in any order or in
any relationship to the WT_CURSOR::next calls, only that all files have
@@ -98,29 +104,35 @@ and removing log files from the original database home:
1. Perform a full backup of the database (as described above).
-2. Perform a full database checkpoint.
-
-3. Open a cursor on the \c "backup:" data source, with the
- \c "target=(\"log:\\")" target specified, which begins the
- process of an incremental backup.
+2. Open a cursor on the \c "backup:" data source, configured with the
+ \c "target=(\"log:\\")" target specified, which begins the process
+ of an incremental backup.
-4. Copy each log file returned by the WT_CURSOR::next method to the backup
+3. Copy each log file returned by the WT_CURSOR::next method to the backup
directory. It is not an error to copy a log file which has been copied
before, but care should be taken to ensure each log file is completely copied
- as the most recent log file may change in size while being copied.
+ as the most recent log file may grow in size while being copied.
-5. If all log files have been successfully copied, archive the log
+4. If all log files have been successfully copied, archive the log
files by calling the WT_SESSION::truncate method with the URI
- <code>log:</code> and specifying the backup cursor as the
- start cursor to that method.
+ <code>log:</code> and specifying the backup cursor as the start
+ cursor to that method. (Note there is no requirement backups be
+ coordinated with database checkpoints; however, an incremental backup
+ will repeatedly copy the same files, and will not make additional log
+ files available for archival, unless there was a checkpoint after the
+ previous incremental backup.)
+
+5. Close the backup cursor.
+
+Steps 2-5 can be repeated any number of times before step 1 is repeated.
+Full and incremental backups may be repeated as long as the backup
+database directory has not been opened and recovery run. Once recovery
+has run in a backup directory, you can no longer back up to that
+database directory.
-6. Close the backup cursor.
+An example of opening the backup data source for an incremental backup:
-Steps 2-6 can be repeated any number of times before step 1 is
-repeated. These steps can be repeated as long as the backup database
-directory has not been opened, recovery run and become live. Once
-the database becomes live, you must repeat all steps 1-6 to another,
-different backup database directory.
+@snippet ex_all.c incremental backup
@section backup_o_direct Backup and O_DIRECT
diff --git a/src/docs/cursor-join.dox b/src/docs/cursor-join.dox
index 51da6b174bf..5ea064a250b 100644
--- a/src/docs/cursor-join.dox
+++ b/src/docs/cursor-join.dox
@@ -14,6 +14,31 @@ Here is an example using join cursors:
Joins support various comparison operators: \c "eq", \c "gt", \c "ge", \c "lt", \c "le". Ranges with lower and upper bounds can also be specified, by joining two cursors on the same index, for example, one with \c "compare=ge" and another \c "compare=lt". In addition to joining indices, the main table can be joined so that a range of primary keys can be specified.
+By default, a join cursor returns a conjunction, that is, all keys that
+satisfy all the joined comparisons. By specifying a configuration with \c
+"operation=or", a join cursor will return a disjunction, or all keys that
+satisfy at least one of the joined comparisons. More complex joins can be
+composed by specifying another join cursor as the reference cursor in a join
+call.
+
+Here is an example using these concepts to show a conjunction of a disjunction:
+
+@snippet ex_schema.c Complex join cursors
+
All the joins should be done on the join cursor before WT_CURSOR::next is called. Calling WT_CURSOR::next on a join cursor for the first time populates any bloom filters and performs other initialization. The join cursor's key is the primary key (the key for the main table), and its value is the entire set of values of the main table. A join cursor can be created with a projection by appending \c "(col1,col2,...)" to the URI if a different set of values is needed.
+Keys returned from the join cursor are ordered according to the
+first reference cursor joined. For example, if an index cursor was joined
+first, that index determines the order of results. If the join cursor
+uses disjunctions, then the ordering of all joins determines the order.
+The first join in a conjunctive join, or all joins in a disjunctive join,
+are distinctive in that they are iterated internally as the cursor join
+returns values in order. Any bloom filters specified on the
+joins that are used for iteration are not useful, and are silently ignored.
+
+When disjunctions are used where the sets of keys overlap on these 'iteration
+joins', a join cursor will return duplicates. A join cursor never returns
+duplicates unless \c "operation=or" is used in a join configuration, or unless
+the first joined cursor is itself a join cursor that would return duplicates.
+
*/
diff --git a/src/docs/custom-file-systems.dox b/src/docs/custom-file-systems.dox
new file mode 100644
index 00000000000..4b012952e15
--- /dev/null
+++ b/src/docs/custom-file-systems.dox
@@ -0,0 +1,25 @@
+/*! @page custom_file_systems Custom File Systems
+
+Applications can provide a custom file system implementation that will be
+used by WiredTiger to interact with the I/O subsystem using the
+WT_FILE_SYSTEM and WT_FILE_HANDLE interfaces.
+
+It is not necessary for all file system providers to implement all methods
+in the WT_FILE_SYSTEM and WT_FILE_HANDLE structures. The documentation for
+those structures indicates which methods are optional. Methods which are not
+provided should be set to NULL. Generally the function pointers should not
+be changed once a handle is created. The one exception is the fallocate
+and fallocate_nolock methods; for an example of how fallocate can be
+changed after a handle is created, see the WiredTiger POSIX file system
+implementation.
+
+WT_FILE_SYSTEM and WT_FILE_HANDLE methods which fail but not fatally
+(for example, a file truncation call which fails because the file is
+currently mapped into memory), should return EBUSY.
+
+Unless explicitly stated otherwise, WiredTiger may invoke methods on the
+WT_FILE_SYSTEM and WT_FILE_HANDLE interfaces from multiple threads
+concurrently. It is the responsibility of the implementation to protect
+any shared data.
+
+*/
diff --git a/src/docs/error-handling.dox b/src/docs/error-handling.dox
index d91a126ee21..62be498fc15 100644
--- a/src/docs/error-handling.dox
+++ b/src/docs/error-handling.dox
@@ -52,6 +52,9 @@ This error indicates an underlying problem that requires the application exit an
@par <code>WT_RUN_RECOVERY</code>
This error is generated when wiredtiger_open is configured to return an error if recovery is required to use the database.
+@par <code>WT_CACHE_FULL</code>
+This error is only generated when wiredtiger_open is configured to run in-memory, and an insert or update operation requires more than the configured cache size to complete. The operation may be retried; if a transaction is in progress, it should be rolled back and the operation retried in a new transaction.
+
@if IGNORE_BUILT_BY_API_ERR_END
@endif
diff --git a/src/docs/examples.dox b/src/docs/examples.dox
index 3ed7357b52c..c5a106a00c9 100644
--- a/src/docs/examples.dox
+++ b/src/docs/examples.dox
@@ -9,9 +9,6 @@ Show how to configure and use asynchronous operations.
A more complex schema based on a call center example, showing how to map
some SQL constructs onto the WiredTiger API.
-@example ex_config.c
-Shows how to configure some properties of the database and tables.
-
@example ex_cursor.c
Shows some common cursor types and operations.
@@ -55,4 +52,7 @@ Shows how to access the database log files.
@example ex_thread.c
Shows how to access a database with multiple threads.
+@example ex_file_system.c
+Shows how to extend WiredTiger with a custom file-system implementation.
+
*/
diff --git a/src/docs/in-memory.dox b/src/docs/in-memory.dox
new file mode 100644
index 00000000000..df221dc34d6
--- /dev/null
+++ b/src/docs/in-memory.dox
@@ -0,0 +1,12 @@
+/*! @m_page{{c,java},in_memory,In-memory databases}
+
+The ::wiredtiger_open \c in_memory configuration changes WiredTiger to
+run in cache without writing to a backing disk. Data is limited to the
+configured cache size.
+
+If \c in_memory is configured, the WT_CURSOR::insert and WT_CURSOR::update
+methods may return an additional error, ::WT_CACHE_FULL, indicating the insert
+or update requires more than the configured cache size to complete. If a
+transaction is in progress, roll it back and retry in a new transaction.
+
+ */
diff --git a/src/docs/programming.dox b/src/docs/programming.dox
index f717f4ed1fe..81e612e8ee8 100644
--- a/src/docs/programming.dox
+++ b/src/docs/programming.dox
@@ -41,6 +41,7 @@ each of which is ordered by one or more columns.
- @subpage compact
- @subpage checkpoint
- @subpage durability
+- @subpage in_memory
- @subpage cursor_join
- @subpage cursor_log
- @ref transaction_named_snapshots
@@ -55,6 +56,7 @@ each of which is ordered by one or more columns.
- @subpage custom_collators
- @subpage custom_extractors
- @subpage custom_data_sources
+- @subpage custom_file_systems
- @subpage helium
@m_endif
diff --git a/src/docs/spell.ok b/src/docs/spell.ok
index efc306568cd..d197b5517f2 100644
--- a/src/docs/spell.ok
+++ b/src/docs/spell.ok
@@ -25,6 +25,7 @@ EBUSY
ECMA
EINVAL
ENCRYPTOR
+ENOTSUP
EmpId
Encryptors
Facebook
@@ -178,6 +179,8 @@ desc
destructor
destructors
dev
+disjunction
+disjunctions
distclean
dl
dll
@@ -207,6 +210,7 @@ erlang
errno
exe
fadvise
+fallocate
failchk
fd's
fdatasync
@@ -331,6 +335,7 @@ nocase
nocasecoll
nodup
noflush
+nolock
nolocking
nommap
nop
diff --git a/src/docs/tune-cache.dox b/src/docs/tune-cache.dox
index c9603085905..505da436277 100644
--- a/src/docs/tune-cache.dox
+++ b/src/docs/tune-cache.dox
@@ -11,9 +11,9 @@ The cache size for the database is normally configured by setting the
function. The cache size can be adjusted after the open call with
WT_CONNECTION::reconfigure.
-An example of setting a cache size to 500MB:
+An example of setting a cache size to 5GB:
-@snippet ex_config.c configure cache size
+@snippet ex_all.c Open a connection
The effectiveness of the chosen cache size can be measured by reviewing
the page eviction statistics for the database.
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index ca98b1bd62a..ffd48afd1a7 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -26,7 +26,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
WT_RET(__wt_evict_file_exclusive_on(session));
/* Make sure the oldest transaction ID is up-to-date. */
- __wt_txn_update_oldest(session, true);
+ WT_RET(__wt_txn_update_oldest(session, true));
/* Walk the tree, discarding pages. */
next_ref = NULL;
@@ -86,6 +86,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
__wt_page_can_evict(session, ref, NULL));
__wt_ref_out(session, ref);
break;
+ case WT_SYNC_CHECKPOINT:
+ case WT_SYNC_WRITE_LEAVES:
WT_ILLEGAL_VALUE_ERR(session);
}
}
diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c
index d3e32d7fc23..f5a6c33e50f 100644
--- a/src/evict/evict_lru.c
+++ b/src/evict/evict_lru.c
@@ -15,10 +15,9 @@ static int __evict_lru_pages(WT_SESSION_IMPL *, bool);
static int __evict_lru_walk(WT_SESSION_IMPL *);
static int __evict_page(WT_SESSION_IMPL *, bool);
static int __evict_pass(WT_SESSION_IMPL *);
-static int __evict_walk(WT_SESSION_IMPL *);
-static int __evict_walk_file(WT_SESSION_IMPL *, u_int *);
+static int __evict_walk(WT_SESSION_IMPL *, uint32_t);
+static int __evict_walk_file(WT_SESSION_IMPL *, uint32_t, u_int *);
static WT_THREAD_RET __evict_worker(void *);
-static int __evict_server_work(WT_SESSION_IMPL *);
/*
* __evict_read_gen --
@@ -108,7 +107,8 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_CACHE *cache;
WT_EVICT_ENTRY *evict;
- uint32_t i, elem;
+ uint32_t i, elem, q;
+ bool found;
WT_ASSERT(session,
__wt_ref_is_root(ref) || ref->state == WT_REF_LOCKED);
@@ -118,18 +118,25 @@ __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref)
return;
cache = S2C(session)->cache;
- __wt_spin_lock(session, &cache->evict_lock);
-
- elem = cache->evict_max;
- for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
- if (evict->ref == ref) {
- __evict_list_clear(session, evict);
- break;
- }
-
- WT_ASSERT(session, !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
+ __wt_spin_lock(session, &cache->evict_queue_lock);
+
+ found = false;
+ for (q = 0; q < WT_EVICT_QUEUE_MAX && !found; q++) {
+ __wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
+ elem = cache->evict_queues[q].evict_max;
+ for (i = 0, evict = cache->evict_queues[q].evict_queue;
+ i < elem; i++, evict++)
+ if (evict->ref == ref) {
+ found = true;
+ __evict_list_clear(session, evict);
+ break;
+ }
+ __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
+ }
+ WT_ASSERT(session,
+ !F_ISSET_ATOMIC(ref->page, WT_PAGE_EVICT_LRU));
- __wt_spin_unlock(session, &cache->evict_lock);
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
}
/*
@@ -183,6 +190,9 @@ __evict_server(void *arg)
conn = S2C(session);
cache = conn->cache;
+#ifdef HAVE_DIAGNOSTIC
+ WT_ERR(__wt_epoch(session, &stuck_ts)); /* -Wuninitialized */
+#endif
while (F_ISSET(conn, WT_CONN_EVICTION_RUN)) {
/* Evict pages from the cache as needed. */
WT_ERR(__evict_pass(session));
@@ -594,7 +604,7 @@ __evict_pass(WT_SESSION_IMPL *session)
* of whether the cache is full, to prevent the oldest ID
* falling too far behind.
*/
- __wt_txn_update_oldest(session, true);
+ WT_RET(__wt_txn_update_oldest(session, loop > 0));
if (!__evict_update_work(session))
break;
@@ -629,7 +639,7 @@ __evict_pass(WT_SESSION_IMPL *session)
conn->cache_size, cache->bytes_inmem, cache->bytes_dirty));
WT_RET(__evict_lru_walk(session));
- WT_RET(__evict_server_work(session));
+ WT_RET_NOTFOUND_OK(__evict_lru_pages(session, true));
/*
* If we're making progress, keep going; if we're not making
@@ -637,6 +647,8 @@ __evict_pass(WT_SESSION_IMPL *session)
* sleep, it's not something we can fix.
*/
if (pages_evicted == cache->pages_evict) {
+ WT_STAT_FAST_CONN_INCR(session,
+ cache_eviction_server_slept);
/*
* Back off if we aren't making progress: walks hold
* the handle list lock, which blocks other operations
@@ -785,7 +797,7 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
WT_CACHE *cache;
WT_DECL_RET;
WT_EVICT_ENTRY *evict;
- u_int i, elem;
+ u_int i, elem, q;
btree = S2BT(session);
cache = S2C(session)->cache;
@@ -819,12 +831,19 @@ __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session)
* The eviction candidate list might reference pages from the file,
* clear it. Hold the evict lock to remove queued pages from a file.
*/
- __wt_spin_lock(session, &cache->evict_lock);
- elem = cache->evict_max;
- for (i = 0, evict = cache->evict_queue; i < elem; i++, evict++)
- if (evict->btree == btree)
- __evict_list_clear(session, evict);
- __wt_spin_unlock(session, &cache->evict_lock);
+ __wt_spin_lock(session, &cache->evict_queue_lock);
+
+ for (q = 0; q < WT_EVICT_QUEUE_MAX; q++) {
+ __wt_spin_lock(session, &cache->evict_queues[q].evict_lock);
+ elem = cache->evict_queues[q].evict_max;
+ for (i = 0, evict = cache->evict_queues[q].evict_queue;
+ i < elem; i++, evict++)
+ if (evict->btree == btree)
+ __evict_list_clear(session, evict);
+ __wt_spin_unlock(session, &cache->evict_queues[q].evict_lock);
+ }
+
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
/*
* We have disabled further eviction: wait for concurrent LRU eviction
@@ -873,6 +892,7 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
__wt_spin_unlock(session, &cache->evict_walk_lock);
}
+#define APP_EVICT_THRESHOLD 3 /* Min app evict % for server to help */
/*
* __evict_lru_pages --
* Get pages from the LRU queue to evict.
@@ -880,7 +900,27 @@ __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session)
static int
__evict_lru_pages(WT_SESSION_IMPL *session, bool is_server)
{
+ WT_CACHE *cache;
WT_DECL_RET;
+ uint64_t app_evict_percent, total_evict;
+
+ /*
+ * The server will not help evict if the workers are coping with
+ * eviction workload, that is, if fewer than the threshold of the
+ * pages are evicted by application threads.
+ */
+ if (is_server && S2C(session)->evict_workers > 1) {
+ cache = S2C(session)->cache;
+ total_evict = cache->app_evicts +
+ cache->server_evicts + cache->worker_evicts;
+ app_evict_percent = (100 * cache->app_evicts) /
+ (total_evict + 1);
+ if (app_evict_percent < APP_EVICT_THRESHOLD) {
+ WT_STAT_FAST_CONN_INCR(session,
+ cache_eviction_server_not_evicting);
+ return (0);
+ }
+ }
/*
* Reconcile and discard some pages: EBUSY is returned if a page fails
@@ -900,23 +940,26 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
{
WT_CACHE *cache;
WT_DECL_RET;
+ WT_EVICT_QUEUE *evict_queue;
uint64_t cutoff, read_gen_oldest;
- uint32_t candidates, entries;
+ uint32_t candidates, entries, queue_index;
cache = S2C(session)->cache;
+ queue_index = cache->evict_queue_fill++ % WT_EVICT_QUEUE_MAX;
+ evict_queue = &cache->evict_queues[queue_index];
/* Get some more pages to consider for eviction. */
- if ((ret = __evict_walk(session)) != 0)
+ if ((ret = __evict_walk(session, queue_index)) != 0)
return (ret == EBUSY ? 0 : ret);
/* Sort the list into LRU order and restart. */
- __wt_spin_lock(session, &cache->evict_lock);
+ __wt_spin_lock(session, &evict_queue->evict_lock);
- entries = cache->evict_entries;
- qsort(cache->evict_queue,
+ entries = evict_queue->evict_entries;
+ qsort(evict_queue->evict_queue,
entries, sizeof(WT_EVICT_ENTRY), __evict_lru_cmp);
- while (entries > 0 && cache->evict_queue[entries - 1].ref == NULL)
+ while (entries > 0 && evict_queue->evict_queue[entries - 1].ref == NULL)
--entries;
/*
@@ -925,9 +968,10 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
* candidates so we never end up with more candidates than entries.
*/
while (entries > WT_EVICT_WALK_BASE)
- __evict_list_clear(session, &cache->evict_queue[--entries]);
+ __evict_list_clear(session,
+ &evict_queue->evict_queue[--entries]);
- cache->evict_entries = entries;
+ evict_queue->evict_entries = entries;
if (entries == 0) {
/*
@@ -935,9 +979,12 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
* Make sure application threads don't read past the end of the
* candidate list, or they may race with the next walk.
*/
- cache->evict_candidates = 0;
+ evict_queue->evict_candidates = 0;
+ __wt_spin_unlock(session, &evict_queue->evict_lock);
+ __wt_spin_lock(session, &cache->evict_queue_lock);
cache->evict_current = NULL;
- __wt_spin_unlock(session, &cache->evict_lock);
+ cache->evict_current_queue = NULL;
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
return (0);
}
@@ -948,7 +995,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
* Take all candidates if we only gathered pages with an oldest
* read generation set.
*/
- cache->evict_candidates = entries;
+ evict_queue->evict_candidates = entries;
} else {
/*
* Find the oldest read generation we have in the queue, used
@@ -958,7 +1005,8 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
read_gen_oldest = WT_READGEN_OLDEST;
for (candidates = 0; candidates < entries; ++candidates) {
read_gen_oldest =
- __evict_read_gen(&cache->evict_queue[candidates]);
+ __evict_read_gen(
+ &evict_queue->evict_queue[candidates]);
if (read_gen_oldest != WT_READGEN_OLDEST)
break;
}
@@ -971,9 +1019,9 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
* of the entries were at the oldest read generation, take them.
*/
if (read_gen_oldest == WT_READGEN_OLDEST)
- cache->evict_candidates = entries;
+ evict_queue->evict_candidates = entries;
else if (candidates >= entries / 2)
- cache->evict_candidates = candidates;
+ evict_queue->evict_candidates = candidates;
else {
/* Save the calculated oldest generation. */
cache->read_gen_oldest = read_gen_oldest;
@@ -981,7 +1029,7 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
/* Find the bottom 25% of read generations. */
cutoff =
(3 * read_gen_oldest + __evict_read_gen(
- &cache->evict_queue[entries - 1])) / 4;
+ &evict_queue->evict_queue[entries - 1])) / 4;
/*
* Don't take less than 10% or more than 50% of entries,
@@ -993,14 +1041,26 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
candidates < entries / 2;
candidates++)
if (__evict_read_gen(
- &cache->evict_queue[candidates]) > cutoff)
+ &evict_queue->evict_queue[candidates]) >
+ cutoff)
break;
- cache->evict_candidates = candidates;
+ evict_queue->evict_candidates = candidates;
}
}
- cache->evict_current = cache->evict_queue;
- __wt_spin_unlock(session, &cache->evict_lock);
+ __wt_spin_unlock(session, &evict_queue->evict_lock);
+ /*
+ * Now we can set the next queue.
+ */
+ __wt_spin_lock(session, &cache->evict_queue_lock);
+ if (cache->evict_current == NULL)
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
+ else
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);
+
+ cache->evict_current = evict_queue->evict_queue;
+ cache->evict_current_queue = evict_queue;
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
/*
* The eviction server thread doesn't do any actual eviction if there
@@ -1012,46 +1072,20 @@ __evict_lru_walk(WT_SESSION_IMPL *session)
}
/*
- * __evict_server_work --
- * Evict pages from the cache based on their read generation.
- */
-static int
-__evict_server_work(WT_SESSION_IMPL *session)
-{
- WT_CACHE *cache;
-
- cache = S2C(session)->cache;
-
- if (S2C(session)->evict_workers > 1) {
- WT_STAT_FAST_CONN_INCR(
- session, cache_eviction_server_not_evicting);
-
- /*
- * If there are candidates queued, give other threads a chance
- * to access them before gathering more.
- */
- if (cache->evict_candidates > 10 &&
- cache->evict_current != NULL)
- __wt_yield();
- } else
- WT_RET_NOTFOUND_OK(__evict_lru_pages(session, true));
-
- return (0);
-}
-
-/*
* __evict_walk --
* Fill in the array by walking the next set of pages.
*/
static int
-__evict_walk(WT_SESSION_IMPL *session)
+__evict_walk(WT_SESSION_IMPL *session, uint32_t queue_index)
{
WT_BTREE *btree;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- u_int max_entries, prev_slot, retries, slot, start_slot, spins;
+ WT_EVICT_QUEUE *evict_queue;
+ u_int max_entries, prev_slot, retries;
+ u_int slot, start_slot, spins;
bool dhandle_locked, incr;
conn = S2C(session);
@@ -1061,16 +1095,12 @@ __evict_walk(WT_SESSION_IMPL *session)
dhandle_locked = incr = false;
retries = 0;
- if (cache->evict_current == NULL)
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_empty);
- else
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_queue_not_empty);
-
/*
* Set the starting slot in the queue and the maximum pages added
* per walk.
*/
- start_slot = slot = cache->evict_entries;
+ evict_queue = &cache->evict_queues[queue_index];
+ start_slot = slot = evict_queue->evict_entries;
max_entries = slot + WT_EVICT_WALK_INCR;
retry: while (slot < max_entries && ret == 0) {
@@ -1154,7 +1184,7 @@ retry: while (slot < max_entries && ret == 0) {
* useful in the past.
*/
if (btree->evict_walk_period != 0 &&
- cache->evict_entries >= WT_EVICT_WALK_INCR &&
+ evict_queue->evict_entries >= WT_EVICT_WALK_INCR &&
btree->evict_walk_skips++ < btree->evict_walk_period)
continue;
btree->evict_walk_skips = 0;
@@ -1180,7 +1210,8 @@ retry: while (slot < max_entries && ret == 0) {
if (!F_ISSET(btree, WT_BTREE_NO_EVICTION)) {
cache->evict_file_next = dhandle;
WT_WITH_DHANDLE(session, dhandle,
- ret = __evict_walk_file(session, &slot));
+ ret = __evict_walk_file(
+ session, queue_index, &slot));
WT_ASSERT(session, session->split_gen == 0);
}
__wt_spin_unlock(session, &cache->evict_walk_lock);
@@ -1217,13 +1248,13 @@ retry: while (slot < max_entries && ret == 0) {
slot < max_entries && (retries < 2 ||
(retries < 10 &&
!FLD_ISSET(cache->state, WT_EVICT_PASS_WOULD_BLOCK) &&
- (slot == cache->evict_entries || slot > start_slot)))) {
+ (slot == evict_queue->evict_entries || slot > start_slot)))) {
start_slot = slot;
++retries;
goto retry;
}
- cache->evict_entries = slot;
+ evict_queue->evict_entries = slot;
return (ret);
}
@@ -1232,18 +1263,15 @@ retry: while (slot < max_entries && ret == 0) {
* Initialize a WT_EVICT_ENTRY structure with a given page.
*/
static void
-__evict_init_candidate(
- WT_SESSION_IMPL *session, WT_EVICT_ENTRY *evict, WT_REF *ref)
+__evict_init_candidate(WT_SESSION_IMPL *session,
+ WT_EVICT_QUEUE *evict_queue, WT_EVICT_ENTRY *evict, WT_REF *ref)
{
- WT_CACHE *cache;
u_int slot;
- cache = S2C(session)->cache;
-
/* Keep track of the maximum slot we are using. */
- slot = (u_int)(evict - cache->evict_queue);
- if (slot >= cache->evict_max)
- cache->evict_max = slot + 1;
+ slot = (u_int)(evict - evict_queue->evict_queue);
+ if (slot >= evict_queue->evict_max)
+ evict_queue->evict_max = slot + 1;
if (evict->ref != NULL)
__evict_list_clear(session, evict);
@@ -1259,13 +1287,14 @@ __evict_init_candidate(
* Get a few page eviction candidates from a single underlying file.
*/
static int
-__evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
+__evict_walk_file(WT_SESSION_IMPL *session, uint32_t queue_index, u_int *slotp)
{
WT_BTREE *btree;
WT_CACHE *cache;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_EVICT_ENTRY *end, *evict, *start;
+ WT_EVICT_QUEUE *evict_queue;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_REF *ref;
@@ -1277,14 +1306,15 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp)
conn = S2C(session);
btree = S2BT(session);
cache = conn->cache;
+ evict_queue = &cache->evict_queues[queue_index];
internal_pages = restarts = 0;
enough = false;
- start = cache->evict_queue + *slotp;
+ start = evict_queue->evict_queue + *slotp;
end = start + WT_EVICT_WALK_PER_FILE;
if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
- end > cache->evict_queue + cache->evict_slots)
- end = cache->evict_queue + cache->evict_slots;
+ end > evict_queue->evict_queue + cache->evict_slots)
+ end = evict_queue->evict_queue + cache->evict_slots;
walk_flags =
WT_READ_CACHE | WT_READ_NO_EVICT | WT_READ_NO_GEN | WT_READ_NO_WAIT;
@@ -1397,7 +1427,7 @@ fast: /* If the page can't be evicted, give up. */
}
WT_ASSERT(session, evict->ref == NULL);
- __evict_init_candidate(session, evict, ref);
+ __evict_init_candidate(session, evict_queue, evict, ref);
++evict;
if (WT_PAGE_IS_INTERNAL(page))
@@ -1432,6 +1462,43 @@ fast: /* If the page can't be evicted, give up. */
}
/*
+ * __evict_check_entry_size --
+ * Return false if an entry's page is too large for this thread to evict
+ * (more than four times the average evicted page size), else true. Used
+ * so the server thread doesn't get stalled evicting a very large page.
+ */
+static bool
+__evict_check_entry_size(WT_SESSION_IMPL *session, WT_EVICT_ENTRY *entry)
+{
+ WT_CACHE *cache;
+ WT_PAGE *page;
+ WT_REF *ref;
+ uint64_t max;
+
+ cache = S2C(session)->cache;
+
+ if (cache->pages_evict == 0) /* No evictions yet, no average */
+ return (true);
+
+ max = (cache->bytes_evict / cache->pages_evict) * 4;
+ if ((ref = entry->ref) != NULL) {
+ if ((page = ref->page) == NULL)
+ return (true);
+ /*
+ * If this page's footprint is more than four times the average
+ * evicted page size, count it and return false; else return true.
+ * XXX Should we care here if the page is dirty? Probably...
+ */
+ if (page->memory_footprint > max) {
+ WT_STAT_FAST_CONN_INCR(
+ session, cache_eviction_server_toobig);
+ return (false);
+ }
+ }
+ return (true);
+}
+
+/*
* __evict_get_ref --
* Get a page for eviction.
*/
@@ -1441,6 +1508,7 @@ __evict_get_ref(
{
WT_CACHE *cache;
WT_EVICT_ENTRY *evict;
+ WT_EVICT_QUEUE *evict_queue;
uint32_t candidates;
cache = S2C(session)->cache;
@@ -1448,33 +1516,56 @@ __evict_get_ref(
*refp = NULL;
/*
- * Avoid the LRU lock if no pages are available. If there are pages
- * available, spin until we get the lock. If this function returns
- * without getting a page to evict, application threads assume there
- * are no more pages available and will attempt to wake the eviction
- * server.
+ * Avoid the LRU lock if no pages are available.
*/
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_get_ref);
+ if (cache->evict_current == NULL) {
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_get_ref_empty);
+ return (WT_NOTFOUND);
+ }
+ __wt_spin_lock(session, &cache->evict_queue_lock);
+ /*
+ * Verify there are still pages available.
+ */
+ if (cache->evict_current == NULL) {
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_get_ref_empty2);
+ return (WT_NOTFOUND);
+ }
+ /*
+ * We got the queue lock, which should be fast, and now we want to
+ * get the lock on the individual queue. We know that the shared
+ * queue fields cannot change now.
+ */
+ evict_queue = cache->evict_current_queue;
for (;;) {
- if (cache->evict_current == NULL)
- return (WT_NOTFOUND);
- if (__wt_spin_trylock(session, &cache->evict_lock) == 0)
+ if (__wt_spin_trylock(session, &evict_queue->evict_lock) == 0)
break;
__wt_yield();
}
-
/*
* Only evict half of the pages before looking for more. The remainder
* are left to eviction workers (if configured), or application threads
* if necessary.
*/
- candidates = cache->evict_candidates;
+ candidates = evict_queue->evict_candidates;
if (is_server && candidates > 1)
candidates /= 2;
/* Get the next page queued for eviction. */
while ((evict = cache->evict_current) != NULL &&
- evict < cache->evict_queue + candidates && evict->ref != NULL) {
+ evict < evict_queue->evict_queue + candidates &&
+ evict->ref != NULL) {
WT_ASSERT(session, evict->btree != NULL);
+ /*
+ * If the server is helping out and encounters an entry that
+ * is too large, it stops helping. Evicting a very large
+ * page in the server thread could stall eviction from finding
+ * new work.
+ */
+ if (is_server && S2C(session)->evict_workers > 1 &&
+ !__evict_check_entry_size(session, evict))
+ break;
/* Move to the next item. */
++cache->evict_current;
@@ -1508,9 +1599,10 @@ __evict_get_ref(
}
/* Clear the current pointer if there are no more candidates. */
- if (evict >= cache->evict_queue + cache->evict_candidates)
+ if (evict >= evict_queue->evict_queue + evict_queue->evict_candidates)
cache->evict_current = NULL;
- __wt_spin_unlock(session, &cache->evict_lock);
+ __wt_spin_unlock(session, &evict_queue->evict_lock);
+ __wt_spin_unlock(session, &cache->evict_queue_lock);
return ((*refp == NULL) ? WT_NOTFOUND : 0);
}
@@ -1523,25 +1615,32 @@ static int
__evict_page(WT_SESSION_IMPL *session, bool is_server)
{
WT_BTREE *btree;
+ WT_CACHE *cache;
WT_DECL_RET;
WT_REF *ref;
WT_RET(__evict_get_ref(session, is_server, &btree, &ref));
WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ cache = S2C(session)->cache;
/*
* An internal session flags either the server itself or an eviction
* worker thread.
*/
if (F_ISSET(session, WT_SESSION_INTERNAL)) {
- if (is_server)
+ if (is_server) {
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_server_evicting);
- else
+ cache->server_evicts++;
+ } else {
WT_STAT_FAST_CONN_INCR(
session, cache_eviction_worker_evicting);
- } else
+ cache->worker_evicts++;
+ }
+ } else {
WT_STAT_FAST_CONN_INCR(session, cache_eviction_app);
+ cache->app_evicts++;
+ }
/*
* In case something goes wrong, don't pick the same set of pages every
@@ -1635,7 +1734,6 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full)
/* Evict a page. */
switch (ret = __evict_page(session, false)) {
case 0:
- cache->app_evicts++;
if (txn_busy)
return (0);
/* FALLTHROUGH */
@@ -1688,9 +1786,9 @@ __wt_evict_priority_clear(WT_SESSION_IMPL *session)
int
__wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
{
+ FILE *fp;
WT_CONNECTION_IMPL *conn;
WT_DATA_HANDLE *dhandle, *saved_dhandle;
- WT_FH *fh;
WT_PAGE *page;
WT_REF *next_walk;
uint64_t dirty_bytes, dirty_pages, intl_bytes, intl_pages;
@@ -1702,13 +1800,12 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
total_bytes = 0;
if (ofile == NULL)
- fh = WT_STDERR(session);
- else
- WT_RET(__wt_open(session, ofile, WT_FILE_TYPE_REGULAR,
- WT_OPEN_CREATE | WT_STREAM_WRITE, &fh));
+ fp = stderr;
+ else if ((fp = fopen(ofile, "w")) == NULL)
+ return (EIO);
/* Note: odd string concatenation avoids spelling errors. */
- (void)__wt_fprintf(session, fh, "==========\n" "cache dump\n");
+ (void)fprintf(fp, "==========\n" "cache dump\n");
saved_dhandle = session->dhandle;
TAILQ_FOREACH(dhandle, &conn->dhqh, q) {
@@ -1747,24 +1844,22 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
session->dhandle = NULL;
if (dhandle->checkpoint == NULL)
- (void)__wt_fprintf(session, fh,
- "%s(<live>): \n", dhandle->name);
+ (void)fprintf(fp, "%s(<live>): \n", dhandle->name);
else
- (void)__wt_fprintf(session, fh,
- "%s(checkpoint=%s): \n",
+ (void)fprintf(fp, "%s(checkpoint=%s): \n",
dhandle->name, dhandle->checkpoint);
if (intl_pages != 0)
- (void)__wt_fprintf(session, fh,
+ (void)fprintf(fp,
"\t" "internal pages: %" PRIu64 " pages, %" PRIu64
" max, %" PRIu64 "MB total\n",
intl_pages, max_intl_bytes, intl_bytes >> 20);
if (leaf_pages != 0)
- (void)__wt_fprintf(session, fh,
+ (void)fprintf(fp,
"\t" "leaf pages: %" PRIu64 " pages, %" PRIu64
" max, %" PRIu64 "MB total\n",
leaf_pages, max_leaf_bytes, leaf_bytes >> 20);
if (dirty_pages != 0)
- (void)__wt_fprintf(session, fh,
+ (void)fprintf(fp,
"\t" "dirty pages: %" PRIu64 " pages, %" PRIu64
" max, %" PRIu64 "MB total\n",
dirty_pages, max_dirty_bytes, dirty_bytes >> 20);
@@ -1780,13 +1875,13 @@ __wt_cache_dump(WT_SESSION_IMPL *session, const char *ofile)
if (conn->cache->overhead_pct != 0)
total_bytes +=
(total_bytes * (uint64_t)conn->cache->overhead_pct) / 100;
- (void)__wt_fprintf(session, fh,
+ (void)fprintf(fp,
"cache dump: total found = %" PRIu64
"MB vs tracked inuse %" PRIu64 "MB\n",
total_bytes >> 20, __wt_cache_bytes_inuse(conn->cache) >> 20);
- (void)__wt_fprintf(session, fh, "==========\n");
- if (ofile != NULL)
- WT_RET(__wt_close(session, &fh));
+ (void)fprintf(fp, "==========\n");
+ if (ofile != NULL && fclose(fp) != 0)
+ return (EIO);
return (0);
}
#endif
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index f0d4752cc83..2d20f53e9ae 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -420,7 +420,7 @@ __evict_review(
* fallen behind current.
*/
if (modified)
- __wt_txn_update_oldest(session, true);
+ WT_RET(__wt_txn_update_oldest(session, false));
if (!__wt_page_can_evict(session, ref, inmem_splitp))
return (EBUSY);
diff --git a/src/include/api.h b/src/include/api.h
index c6a5af40698..50b2eab83b8 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -118,7 +118,7 @@
#define JOINABLE_CURSOR_CALL_CHECK(cur) \
if (F_ISSET(cur, WT_CURSTD_JOINED)) \
- WT_ERR(__wt_curindex_joined(cur))
+ WT_ERR(__wt_curjoin_joined(cur))
#define JOINABLE_CURSOR_API_CALL(cur, s, n, bt) \
CURSOR_API_CALL(cur, s, n, bt); \
diff --git a/src/include/bitstring.i b/src/include/bitstring.i
index 0d30e55d1ef..08746beb9b9 100644
--- a/src/include/bitstring.i
+++ b/src/include/bitstring.i
@@ -261,10 +261,10 @@ __bit_getv(uint8_t *bitf, uint64_t entry, uint8_t width)
* Return a record number's bit-field value.
*/
static inline uint8_t
-__bit_getv_recno(WT_PAGE *page, uint64_t recno, uint8_t width)
+__bit_getv_recno(WT_REF *ref, uint64_t recno, uint8_t width)
{
return (__bit_getv(
- page->pg_fix_bitf, recno - page->pg_fix_recno, width));
+ ref->page->pg_fix_bitf, recno - ref->ref_recno, width));
}
/*
@@ -305,13 +305,3 @@ __bit_setv(uint8_t *bitf, uint64_t entry, uint8_t width, uint8_t value)
__BIT_SET(1, 0x01);
}
}
-
-/*
- * __bit_setv_recno --
- * Set a record number's bit-field value.
- */
-static inline void
-__bit_setv_recno(WT_PAGE *page, uint64_t recno, uint8_t width, uint8_t value)
-{
- __bit_setv(page->pg_fix_bitf, recno - page->pg_fix_recno, width, value);
-}
diff --git a/src/include/block.h b/src/include/block.h
index e964fb4e8c2..9f652ceddb9 100644
--- a/src/include/block.h
+++ b/src/include/block.h
@@ -174,6 +174,7 @@ struct __wt_bm {
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
bool (*is_mapped)(WT_BM *, WT_SESSION_IMPL *);
+ int (*map_discard)(WT_BM *, WT_SESSION_IMPL *, void *, size_t);
int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*read)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
@@ -196,9 +197,9 @@ struct __wt_bm {
WT_BLOCK *block; /* Underlying file */
- void *map; /* Mapped region */
- size_t maplen;
- void *mappingcookie;
+ void *map; /* Mapped region */
+ size_t maplen;
+ void *mapped_cookie;
/*
* There's only a single block manager handle that can be written, all
@@ -224,8 +225,6 @@ struct __wt_block {
wt_off_t size; /* File size */
wt_off_t extend_size; /* File extended size */
wt_off_t extend_len; /* File extend chunk size */
- bool nowait_sync_available; /* File can flush asynchronously */
- bool preload_available; /* File pages can be preloaded */
/* Configuration information, set when the file is opened. */
uint32_t allocfirst; /* Allocation is first-fit */
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 7cdf2bef43a..9700b6f4761 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -251,6 +251,7 @@ struct __wt_page_modify {
*/
union {
WT_ADDR replace; /* Single, written replacement block */
+#undef mod_replace
#define mod_replace u1.replace
struct { /* Multiple replacement blocks */
@@ -295,7 +296,9 @@ struct __wt_page_modify {
} *multi;
uint32_t multi_entries; /* Multiple blocks element count */
} m;
+#undef mod_multi
#define mod_multi u1.m.multi
+#undef mod_multi_entries
#define mod_multi_entries u1.m.multi_entries
} u1;
@@ -318,6 +321,7 @@ struct __wt_page_modify {
*/
WT_PAGE *root_split; /* Linked list of root split pages */
} intl;
+#undef mod_root_split
#define mod_root_split u2.intl.root_split
struct {
/*
@@ -344,10 +348,24 @@ struct __wt_page_modify {
* write any implicitly created deleted records for the page.
*/
uint64_t split_recno;
- } leaf;
-#define mod_append u2.leaf.append
-#define mod_update u2.leaf.update
-#define mod_split_recno u2.leaf.split_recno
+ } column_leaf;
+#undef mod_col_append
+#define mod_col_append u2.column_leaf.append
+#undef mod_col_update
+#define mod_col_update u2.column_leaf.update
+#undef mod_col_split_recno
+#define mod_col_split_recno u2.column_leaf.split_recno
+ struct {
+ /* Inserted items for row-store. */
+ WT_INSERT_HEAD **insert;
+
+ /* Updated items for row-stores. */
+ WT_UPDATE **update;
+ } row_leaf;
+#undef mod_row_insert
+#define mod_row_insert u2.row_leaf.insert
+#undef mod_row_update
+#define mod_row_update u2.row_leaf.update
} u2;
/*
@@ -433,7 +451,6 @@ struct __wt_page {
* doesn't read it multiple times).
*/
struct {
- uint64_t recno; /* Starting recno */
WT_REF *parent_ref; /* Parent reference */
struct __wt_page_index {
@@ -442,8 +459,7 @@ struct __wt_page {
WT_REF **index;
} * volatile __index; /* Collated children */
} intl;
-#undef pg_intl_recno
-#define pg_intl_recno u.intl.recno
+#undef pg_intl_parent_ref
#define pg_intl_parent_ref u.intl.parent_ref
/*
@@ -482,40 +498,19 @@ struct __wt_page {
/* Row-store leaf page. */
struct {
- /*
- * The column-store leaf page modification structures
- * live in the WT_PAGE_MODIFY structure to keep the
- * WT_PAGE structure as small as possible for read-only
- * pages. For consistency, we could move the row-store
- * modification structures into WT_PAGE_MODIFY too, but
- * that doesn't shrink WT_PAGE any further and it would
- * require really ugly naming inside of WT_PAGE_MODIFY
- * to avoid growing that structure.
- */
- WT_INSERT_HEAD **ins; /* Inserts */
- WT_UPDATE **upd; /* Updates */
-
WT_ROW *d; /* Key/value pairs */
uint32_t entries; /* Entries */
} row;
#undef pg_row_d
#define pg_row_d u.row.d
-#undef pg_row_ins
-#define pg_row_ins u.row.ins
-#undef pg_row_upd
-#define pg_row_upd u.row.upd
#undef pg_row_entries
#define pg_row_entries u.row.entries
/* Fixed-length column-store leaf page. */
struct {
- uint64_t recno; /* Starting recno */
-
uint8_t *bitf; /* Values */
uint32_t entries; /* Entries */
} col_fix;
-#undef pg_fix_recno
-#define pg_fix_recno u.col_fix.recno
#undef pg_fix_bitf
#define pg_fix_bitf u.col_fix.bitf
#undef pg_fix_entries
@@ -523,8 +518,6 @@ struct __wt_page {
/* Variable-length column-store leaf page. */
struct {
- uint64_t recno; /* Starting recno */
-
WT_COL *d; /* Values */
/*
@@ -537,8 +530,6 @@ struct __wt_page {
uint32_t entries; /* Entries */
} col_var;
-#undef pg_var_recno
-#define pg_var_recno u.col_var.recno
#undef pg_var_d
#define pg_var_d u.col_var.d
#undef pg_var_repeats
@@ -732,6 +723,10 @@ struct __wt_ref {
uint64_t recno; /* Column-store: starting recno */
void *ikey; /* Row-store: key */
} key;
+#undef ref_recno
+#define ref_recno key.recno
+#undef ref_ikey
+#define ref_ikey key.ikey
WT_PAGE_DELETED *page_del; /* Deleted on-disk page information */
};
@@ -1007,12 +1002,15 @@ struct __wt_insert_head {
* of pointers and the specific structure exist, else NULL.
*/
#define WT_ROW_INSERT_SLOT(page, slot) \
- ((page)->pg_row_ins == NULL ? NULL : (page)->pg_row_ins[slot])
+ ((page)->modify == NULL || \
+ (page)->modify->mod_row_insert == NULL ? \
+ NULL : (page)->modify->mod_row_insert[slot])
#define WT_ROW_INSERT(page, ip) \
WT_ROW_INSERT_SLOT(page, WT_ROW_SLOT(page, ip))
#define WT_ROW_UPDATE(page, ip) \
- ((page)->pg_row_upd == NULL ? \
- NULL : (page)->pg_row_upd[WT_ROW_SLOT(page, ip)])
+ ((page)->modify == NULL || \
+ (page)->modify->mod_row_update == NULL ? \
+ NULL : (page)->modify->mod_row_update[WT_ROW_SLOT(page, ip)])
/*
* WT_ROW_INSERT_SMALLEST references an additional slot past the end of the
* "one per WT_ROW slot" insert array. That's because the insert array
@@ -1020,8 +1018,9 @@ struct __wt_insert_head {
* original page.
*/
#define WT_ROW_INSERT_SMALLEST(page) \
- ((page)->pg_row_ins == NULL ? \
- NULL : (page)->pg_row_ins[(page)->pg_row_entries])
+ ((page)->modify == NULL || \
+ (page)->modify->mod_row_insert == NULL ? \
+ NULL : (page)->modify->mod_row_insert[(page)->pg_row_entries])
/*
* The column-store leaf page update lists are arrays of pointers to structures,
@@ -1029,8 +1028,9 @@ struct __wt_insert_head {
* of pointers and the specific structure exist, else NULL.
*/
#define WT_COL_UPDATE_SLOT(page, slot) \
- ((page)->modify == NULL || (page)->modify->mod_update == NULL ? \
- NULL : (page)->modify->mod_update[slot])
+ ((page)->modify == NULL || \
+ (page)->modify->mod_col_update == NULL ? \
+ NULL : (page)->modify->mod_col_update[slot])
#define WT_COL_UPDATE(page, ip) \
WT_COL_UPDATE_SLOT(page, WT_COL_SLOT(page, ip))
@@ -1046,8 +1046,9 @@ struct __wt_insert_head {
* appends.
*/
#define WT_COL_APPEND(page) \
- ((page)->modify != NULL && (page)->modify->mod_append != NULL ? \
- (page)->modify->mod_append[0] : NULL)
+ ((page)->modify == NULL || \
+ (page)->modify->mod_col_append == NULL ? \
+ NULL : (page)->modify->mod_col_append[0])
/* WT_FIX_FOREACH walks fixed-length bit-fields on a disk page. */
#define WT_FIX_FOREACH(btree, dsk, v, i) \
diff --git a/src/include/btree.i b/src/include/btree.i
index 6df7f87073f..4c8166ca6a6 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -511,8 +511,8 @@ __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
/*
* An internal page key is in one of two places: if we instantiated the
- * key (for example, when reading the page), WT_REF.key.ikey references
- * a WT_IKEY structure, otherwise WT_REF.key.ikey references an on-page
+ * key (for example, when reading the page), WT_REF.ref_ikey references
+ * a WT_IKEY structure, otherwise WT_REF.ref_ikey references an on-page
* key offset/length pair.
*
* Now the magic: allocated memory must be aligned to store any standard
@@ -536,14 +536,14 @@ __wt_ref_key(WT_PAGE *page, WT_REF *ref, void *keyp, size_t *sizep)
#define WT_IK_DECODE_KEY_LEN(v) ((v) >> 32)
#define WT_IK_ENCODE_KEY_OFFSET(v) ((uintptr_t)(v) << 1)
#define WT_IK_DECODE_KEY_OFFSET(v) (((v) & 0xFFFFFFFF) >> 1)
- v = (uintptr_t)ref->key.ikey;
+ v = (uintptr_t)ref->ref_ikey;
if (v & WT_IK_FLAG) {
*(void **)keyp =
WT_PAGE_REF_OFFSET(page, WT_IK_DECODE_KEY_OFFSET(v));
*sizep = WT_IK_DECODE_KEY_LEN(v);
} else {
- *(void **)keyp = WT_IKEY_DATA(ref->key.ikey);
- *sizep = ((WT_IKEY *)ref->key.ikey)->size;
+ *(void **)keyp = WT_IKEY_DATA(ref->ref_ikey);
+ *sizep = ((WT_IKEY *)ref->ref_ikey)->size;
}
}
@@ -562,7 +562,7 @@ __wt_ref_key_onpage_set(WT_PAGE *page, WT_REF *ref, WT_CELL_UNPACK *unpack)
v = WT_IK_ENCODE_KEY_LEN(unpack->size) |
WT_IK_ENCODE_KEY_OFFSET(WT_PAGE_DISK_OFFSET(page, unpack->data)) |
WT_IK_FLAG;
- ref->key.ikey = (void *)v;
+ ref->ref_ikey = (void *)v;
}
/*
@@ -577,8 +577,8 @@ __wt_ref_key_instantiated(WT_REF *ref)
/*
* See the comment in __wt_ref_key for an explanation of the magic.
*/
- v = (uintptr_t)ref->key.ikey;
- return (v & WT_IK_FLAG ? NULL : ref->key.ikey);
+ v = (uintptr_t)ref->ref_ikey;
+ return (v & WT_IK_FLAG ? NULL : ref->ref_ikey);
}
/*
@@ -591,10 +591,10 @@ __wt_ref_key_clear(WT_REF *ref)
/*
* The key union has 2 8B fields; this is equivalent to:
*
- * ref->key.recno = WT_RECNO_OOB;
- * ref->key.ikey = NULL;
+ * ref->ref_recno = WT_RECNO_OOB;
+ * ref->ref_ikey = NULL;
*/
- ref->key.recno = 0;
+ ref->ref_recno = 0;
}
/*
diff --git a/src/include/cache.h b/src/include/cache.h
index 9184a2fe6ed..4f7981a5df9 100644
--- a/src/include/cache.h
+++ b/src/include/cache.h
@@ -26,6 +26,19 @@ struct __wt_evict_entry {
WT_REF *ref; /* Page to flush/evict */
};
+#define WT_EVICT_QUEUE_MAX 2
+/*
+ * WT_EVICT_QUEUE --
+ * Encapsulation of an eviction candidate queue.
+ */
+struct __wt_evict_queue {
+ WT_SPINLOCK evict_lock; /* Eviction LRU queue */
+ WT_EVICT_ENTRY *evict_queue; /* LRU pages being tracked */
+ uint32_t evict_candidates; /* LRU list pages to evict */
+ uint32_t evict_entries; /* LRU entries in the queue */
+ volatile uint32_t evict_max; /* LRU maximum eviction slot used */
+};
+
/*
* WT_EVICT_WORKER --
* Encapsulation of an eviction worker thread.
@@ -67,8 +80,10 @@ struct __wt_cache {
uint64_t pages_dirty;
uint64_t bytes_read; /* Bytes read into memory */
- uint64_t app_evicts; /* Pages evicted by user threads */
uint64_t app_waits; /* User threads waited for cache */
+ uint64_t app_evicts; /* Pages evicted by user threads */
+ uint64_t server_evicts; /* Pages evicted by server thread */
+ uint64_t worker_evicts; /* Pages evicted by worker threads */
uint64_t evict_max_page_size; /* Largest page seen at eviction */
@@ -83,7 +98,6 @@ struct __wt_cache {
* Eviction thread information.
*/
WT_CONDVAR *evict_cond; /* Eviction server condition */
- WT_SPINLOCK evict_lock; /* Eviction LRU queue */
WT_SPINLOCK evict_walk_lock; /* Eviction walk location */
/* Condition signalled when the eviction server populates the queue */
WT_CONDVAR *evict_waiter_cond;
@@ -98,11 +112,11 @@ struct __wt_cache {
/*
* LRU eviction list information.
*/
- WT_EVICT_ENTRY *evict_queue; /* LRU pages being tracked */
+ WT_SPINLOCK evict_queue_lock; /* Eviction current queue lock */
+ WT_EVICT_QUEUE evict_queues[WT_EVICT_QUEUE_MAX];
+ WT_EVICT_QUEUE *evict_current_queue;/* LRU current queue in use */
WT_EVICT_ENTRY *evict_current; /* LRU current page to be evicted */
- uint32_t evict_candidates; /* LRU list pages to evict */
- uint32_t evict_entries; /* LRU entries in the queue */
- volatile uint32_t evict_max; /* LRU maximum eviction slot used */
+ uint32_t evict_queue_fill; /* LRU eviction queue index to fill */
uint32_t evict_slots; /* LRU list eviction slots */
WT_DATA_HANDLE
*evict_file_next; /* LRU next file to search */
diff --git a/src/include/column.i b/src/include/column.i
index d64e68420a5..d15f874b281 100644
--- a/src/include/column.i
+++ b/src/include/column.i
@@ -209,9 +209,12 @@ __col_insert_search(WT_INSERT_HEAD *ins_head,
* Return the last record number for a variable-length column-store page.
*/
static inline uint64_t
-__col_var_last_recno(WT_PAGE *page)
+__col_var_last_recno(WT_REF *ref)
{
WT_COL_RLE *repeat;
+ WT_PAGE *page;
+
+ page = ref->page;
/*
* If there's an append list, there may be more records on the page.
@@ -220,7 +223,7 @@ __col_var_last_recno(WT_PAGE *page)
*/
if (page->pg_var_nrepeats == 0)
return (page->pg_var_entries == 0 ? 0 :
- page->pg_var_recno + (page->pg_var_entries - 1));
+ ref->ref_recno + (page->pg_var_entries - 1));
repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1];
return ((repeat->recno + repeat->rle) - 1 +
@@ -232,15 +235,19 @@ __col_var_last_recno(WT_PAGE *page)
* Return the last record number for a fixed-length column-store page.
*/
static inline uint64_t
-__col_fix_last_recno(WT_PAGE *page)
+__col_fix_last_recno(WT_REF *ref)
{
+ WT_PAGE *page;
+
+ page = ref->page;
+
/*
* If there's an append list, there may be more records on the page.
* This function ignores those records, our callers must handle that
* explicitly, if they care.
*/
- return (page->pg_fix_entries == 0 ? 0 :
- page->pg_fix_recno + (page->pg_fix_entries - 1));
+ return (page->pg_fix_entries == 0 ?
+ 0 : ref->ref_recno + (page->pg_fix_entries - 1));
}
/*
@@ -248,12 +255,15 @@ __col_fix_last_recno(WT_PAGE *page)
* Search a variable-length column-store page for a record.
*/
static inline WT_COL *
-__col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop)
+__col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop)
{
WT_COL_RLE *repeat;
+ WT_PAGE *page;
uint64_t start_recno;
uint32_t base, indx, limit, start_indx;
+ page = ref->page;
+
/*
* Find the matching slot.
*
@@ -285,7 +295,7 @@ __col_var_search(WT_PAGE *page, uint64_t recno, uint64_t *start_recnop)
*/
if (base == 0) {
start_indx = 0;
- start_recno = page->pg_var_recno;
+ start_recno = ref->ref_recno;
} else {
repeat = page->pg_var_repeats + (base - 1);
start_indx = repeat->indx + 1;
diff --git a/src/include/config.h b/src/include/config.h
index 48a255134af..486aa50e86c 100644
--- a/src/include/config.h
+++ b/src/include/config.h
@@ -59,41 +59,42 @@ struct __wt_config_parser_impl {
#define WT_CONFIG_ENTRY_WT_CONNECTION_load_extension 7
#define WT_CONFIG_ENTRY_WT_CONNECTION_open_session 8
#define WT_CONFIG_ENTRY_WT_CONNECTION_reconfigure 9
-#define WT_CONFIG_ENTRY_WT_CURSOR_close 10
-#define WT_CONFIG_ENTRY_WT_CURSOR_reconfigure 11
-#define WT_CONFIG_ENTRY_WT_SESSION_begin_transaction 12
-#define WT_CONFIG_ENTRY_WT_SESSION_checkpoint 13
-#define WT_CONFIG_ENTRY_WT_SESSION_close 14
-#define WT_CONFIG_ENTRY_WT_SESSION_commit_transaction 15
-#define WT_CONFIG_ENTRY_WT_SESSION_compact 16
-#define WT_CONFIG_ENTRY_WT_SESSION_create 17
-#define WT_CONFIG_ENTRY_WT_SESSION_drop 18
-#define WT_CONFIG_ENTRY_WT_SESSION_join 19
-#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 20
-#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 21
-#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 22
-#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 23
-#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 24
-#define WT_CONFIG_ENTRY_WT_SESSION_rename 25
-#define WT_CONFIG_ENTRY_WT_SESSION_reset 26
-#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 27
-#define WT_CONFIG_ENTRY_WT_SESSION_salvage 28
-#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 29
-#define WT_CONFIG_ENTRY_WT_SESSION_strerror 30
-#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 31
-#define WT_CONFIG_ENTRY_WT_SESSION_truncate 32
-#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 33
-#define WT_CONFIG_ENTRY_WT_SESSION_verify 34
-#define WT_CONFIG_ENTRY_colgroup_meta 35
-#define WT_CONFIG_ENTRY_file_config 36
-#define WT_CONFIG_ENTRY_file_meta 37
-#define WT_CONFIG_ENTRY_index_meta 38
-#define WT_CONFIG_ENTRY_lsm_meta 39
-#define WT_CONFIG_ENTRY_table_meta 40
-#define WT_CONFIG_ENTRY_wiredtiger_open 41
-#define WT_CONFIG_ENTRY_wiredtiger_open_all 42
-#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 43
-#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 44
+#define WT_CONFIG_ENTRY_WT_CONNECTION_set_file_system 10
+#define WT_CONFIG_ENTRY_WT_CURSOR_close 11
+#define WT_CONFIG_ENTRY_WT_CURSOR_reconfigure 12
+#define WT_CONFIG_ENTRY_WT_SESSION_begin_transaction 13
+#define WT_CONFIG_ENTRY_WT_SESSION_checkpoint 14
+#define WT_CONFIG_ENTRY_WT_SESSION_close 15
+#define WT_CONFIG_ENTRY_WT_SESSION_commit_transaction 16
+#define WT_CONFIG_ENTRY_WT_SESSION_compact 17
+#define WT_CONFIG_ENTRY_WT_SESSION_create 18
+#define WT_CONFIG_ENTRY_WT_SESSION_drop 19
+#define WT_CONFIG_ENTRY_WT_SESSION_join 20
+#define WT_CONFIG_ENTRY_WT_SESSION_log_flush 21
+#define WT_CONFIG_ENTRY_WT_SESSION_log_printf 22
+#define WT_CONFIG_ENTRY_WT_SESSION_open_cursor 23
+#define WT_CONFIG_ENTRY_WT_SESSION_rebalance 24
+#define WT_CONFIG_ENTRY_WT_SESSION_reconfigure 25
+#define WT_CONFIG_ENTRY_WT_SESSION_rename 26
+#define WT_CONFIG_ENTRY_WT_SESSION_reset 27
+#define WT_CONFIG_ENTRY_WT_SESSION_rollback_transaction 28
+#define WT_CONFIG_ENTRY_WT_SESSION_salvage 29
+#define WT_CONFIG_ENTRY_WT_SESSION_snapshot 30
+#define WT_CONFIG_ENTRY_WT_SESSION_strerror 31
+#define WT_CONFIG_ENTRY_WT_SESSION_transaction_sync 32
+#define WT_CONFIG_ENTRY_WT_SESSION_truncate 33
+#define WT_CONFIG_ENTRY_WT_SESSION_upgrade 34
+#define WT_CONFIG_ENTRY_WT_SESSION_verify 35
+#define WT_CONFIG_ENTRY_colgroup_meta 36
+#define WT_CONFIG_ENTRY_file_config 37
+#define WT_CONFIG_ENTRY_file_meta 38
+#define WT_CONFIG_ENTRY_index_meta 39
+#define WT_CONFIG_ENTRY_lsm_meta 40
+#define WT_CONFIG_ENTRY_table_meta 41
+#define WT_CONFIG_ENTRY_wiredtiger_open 42
+#define WT_CONFIG_ENTRY_wiredtiger_open_all 43
+#define WT_CONFIG_ENTRY_wiredtiger_open_basecfg 44
+#define WT_CONFIG_ENTRY_wiredtiger_open_usercfg 45
/*
* configuration section: END
* DO NOT EDIT: automatically built by dist/flags.py.
diff --git a/src/include/connection.h b/src/include/connection.h
index c2b1dd68c18..e6cff08f0ae 100644
--- a/src/include/connection.h
+++ b/src/include/connection.h
@@ -145,20 +145,6 @@ struct __wt_named_extractor {
} while (0)
/*
- * Macros to ensure the file handle is inserted or removed from both the
- * main queue and the hashed queue.
- */
-#define WT_CONN_FILE_INSERT(conn, fh, bucket) do { \
- TAILQ_INSERT_HEAD(&(conn)->fhqh, fh, q); \
- TAILQ_INSERT_HEAD(&(conn)->fhhash[bucket], fh, hashq); \
-} while (0)
-
-#define WT_CONN_FILE_REMOVE(conn, fh, bucket) do { \
- TAILQ_REMOVE(&(conn)->fhqh, fh, q); \
- TAILQ_REMOVE(&(conn)->fhhash[bucket], fh, hashq); \
-} while (0)
-
-/*
* WT_CONNECTION_IMPL --
* Implementation of WT_CONNECTION
*/
@@ -333,7 +319,7 @@ struct __wt_connection_impl {
bool stat_tid_set; /* Statistics log thread set */
WT_CONDVAR *stat_cond; /* Statistics log wait mutex */
const char *stat_format; /* Statistics log timestamp format */
- WT_FH *stat_fh; /* Statistics log file handle */
+ WT_FSTREAM *stat_fs; /* Statistics log stream */
char *stat_path; /* Statistics log path format */
char **stat_sources; /* Statistics log list of objects */
const char *stat_stamp; /* Statistics log entry timestamp */
@@ -414,32 +400,26 @@ struct __wt_connection_impl {
wt_off_t data_extend_len; /* file_extend data length */
wt_off_t log_extend_len; /* file_extend log length */
- /* O_DIRECT/FILE_FLAG_NO_BUFFERING file type flags */
- uint32_t direct_io;
- uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH type flags */
+#define WT_DIRECT_IO_CHECKPOINT 0x01 /* Checkpoints */
+#define WT_DIRECT_IO_DATA 0x02 /* Data files */
+#define WT_DIRECT_IO_LOG 0x04 /* Log files */
+ uint32_t direct_io; /* O_DIRECT, FILE_FLAG_NO_BUFFERING */
+
+ uint32_t write_through; /* FILE_FLAG_WRITE_THROUGH */
+
bool mmap; /* mmap configuration */
int page_size; /* OS page size for mmap alignment */
uint32_t verbose;
- void *inmemory; /* In-memory configuration cookie */
-
#define WT_STDERR(s) (&S2C(s)->wt_stderr)
#define WT_STDOUT(s) (&S2C(s)->wt_stdout)
- WT_FH wt_stderr, wt_stdout;
+ WT_FSTREAM wt_stderr, wt_stdout;
/*
- * OS library/system call jump table, to support in-memory and readonly
- * configurations as well as special devices with other non-POSIX APIs.
+ * File system interface abstracted to support alternative file system
+ * implementations.
*/
- int (*file_directory_list)(WT_SESSION_IMPL *,
- const char *, const char *, uint32_t, char ***, u_int *);
- int (*file_directory_sync)(WT_SESSION_IMPL *, const char *);
- int (*file_exist)(WT_SESSION_IMPL *, const char *, bool *);
- int (*file_remove)(WT_SESSION_IMPL *, const char *);
- int (*file_rename)(WT_SESSION_IMPL *, const char *, const char *);
- int (*file_size)(WT_SESSION_IMPL *, const char *, bool, wt_off_t *);
- int (*handle_open)(WT_SESSION_IMPL *,
- WT_FH *, const char *, uint32_t, uint32_t);
+ WT_FILE_SYSTEM *file_system;
uint32_t flags;
};
diff --git a/src/include/cursor.h b/src/include/cursor.h
index 1d2ce1bfd82..6357523a03f 100644
--- a/src/include/cursor.h
+++ b/src/include/cursor.h
@@ -67,7 +67,7 @@ struct __wt_cursor_backup {
WT_CURSOR iface;
size_t next; /* Cursor position */
- WT_FH *bfh; /* Backup file */
+ WT_FSTREAM *bfs; /* Backup file stream */
uint32_t maxid; /* Maximum log file ID seen */
WT_CURSOR_BACKUP_ENTRY *list; /* List of files to be copied. */
@@ -284,18 +284,50 @@ struct __wt_cursor_index {
uint8_t *cg_needvalue;
};
+/*
+ * A join iterator structure is used to generate candidate primary keys. It
+ * is the responsibility of the caller of the iterator to filter these
+ * primary keys against the other conditions of the join before returning
+ * them to the caller of WT_CURSOR::next.
+ *
+ * For a conjunction join (the default), entry_count will be 1, meaning that
+ * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY). That
+ * is, it successively returns primary keys from a cursor for the first
+ * index that was joined. When the values returned by that cursor are
+ * exhausted, the iterator has completed. For a disjunction join,
+ * exhausting a cursor just means that the iterator advances to the next
+ * entry. If the next entry represents an index, a new cursor is opened and
+ * primary keys from that index are then successively returned.
+ *
+ * When positioned on an entry that represents a nested join, a new child
+ * iterator is created that will be bound to the nested WT_CURSOR_JOIN.
+ * That iterator is then used to generate candidate primary keys. When its
+ * iteration is completed, that iterator is destroyed and the parent
+ * iterator advances to the next entry. Thus, depending on how deeply joins
+ * are nested, a similarly deep stack of iterators is created.
+ */
struct __wt_cursor_join_iter {
WT_SESSION_IMPL *session;
WT_CURSOR_JOIN *cjoin;
WT_CURSOR_JOIN_ENTRY *entry;
+ WT_CURSOR_JOIN_ITER *child;
WT_CURSOR *cursor; /* has null projection */
- WT_CURSOR *main; /* main table with projection */
WT_ITEM *curkey; /* primary key */
WT_ITEM idxkey;
+ u_int entry_pos; /* the current entry */
+ u_int entry_count; /* entries to walk */
+ u_int end_pos; /* the current endpoint */
+ u_int end_count; /* endpoints to walk */
+ u_int end_skip; /* when testing for inclusion */
+ /* can we skip current end? */
bool positioned;
- bool isequal; /* advancing means we're done */
+ bool is_equal;
};
+/*
+ * A join endpoint represents a positioned cursor that is 'captured' by a
+ * WT_SESSION::join call.
+ */
struct __wt_cursor_join_endpoint {
WT_ITEM key;
uint8_t recno_buf[10]; /* holds packed recno */
@@ -313,9 +345,17 @@ struct __wt_cursor_join_endpoint {
((endp)->flags & \
(WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT))
+/*
+ * Each join entry typically represents an index's participation in a join.
+ * For example, if 'k' is an index, then "t.k > 10 && t.k < 20" would be
+ * represented by a single entry, with two endpoints. When the index and
+ * subjoin fields are NULL, the join is on the main table. When subjoin is
+ * non-NULL, there is a nested join clause.
+ */
struct __wt_cursor_join_entry {
WT_INDEX *index;
WT_CURSOR *main; /* raw main table cursor */
+ WT_CURSOR_JOIN *subjoin; /* a nested join clause */
WT_BLOOM *bloom; /* Bloom filter handle */
char *repack_format; /* target format for repack */
uint32_t bloom_bit_count; /* bits per item in bloom */
@@ -339,15 +379,17 @@ struct __wt_cursor_join {
WT_TABLE *table;
const char *projection;
- WT_CURSOR_JOIN_ITER *iter;
+ WT_CURSOR *main; /* main table with projection */
+ WT_CURSOR_JOIN *parent; /* parent of nested group */
+ WT_CURSOR_JOIN_ITER *iter; /* chain of iterators */
WT_CURSOR_JOIN_ENTRY *entries;
size_t entries_allocated;
u_int entries_next;
uint8_t recno_buf[10]; /* holds packed recno */
-#define WT_CURJOIN_ERROR 0x01 /* Error in initialization */
-#define WT_CURJOIN_INITIALIZED 0x02 /* Successful initialization */
-#define WT_CURJOIN_SKIP_FIRST_LEFT 0x04 /* First check not needed */
+#define WT_CURJOIN_DISJUNCTION 0x01 /* Entries are or-ed */
+#define WT_CURJOIN_ERROR 0x02 /* Error in initialization */
+#define WT_CURJOIN_INITIALIZED 0x04 /* Successful initialization */
uint8_t flags;
};
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 8ab96c0a69d..553dd03f958 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -270,7 +270,7 @@ __cursor_func_init(WT_CURSOR_BTREE *cbt, bool reenter)
* to read.
*/
if (!F_ISSET(cbt, WT_CBT_NO_TXN))
- __wt_txn_cursor_op(session);
+ WT_RET(__wt_txn_cursor_op(session));
return (0);
}
diff --git a/src/include/extern.h b/src/include/extern.h
index 292bcfb1c7c..53e49e51a26 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -41,8 +41,8 @@ extern int __wt_block_extlist_write(WT_SESSION_IMPL *session, WT_BLOCK *block, W
extern int __wt_block_extlist_truncate( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el);
extern int __wt_block_extlist_init(WT_SESSION_IMPL *session, WT_EXTLIST *el, const char *name, const char *extname, bool track_size);
extern void __wt_block_extlist_free(WT_SESSION_IMPL *session, WT_EXTLIST *el);
-extern int __wt_block_map( WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapp, size_t *maplenp, void **mappingcookie);
-extern int __wt_block_unmap( WT_SESSION_IMPL *session, WT_BLOCK *block, void *map, size_t maplen, void **mappingcookie);
+extern int __wt_block_map(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_regionp, size_t *lengthp, void *mapped_cookiep);
+extern int __wt_block_unmap(WT_SESSION_IMPL *session, WT_BLOCK *block, void *mapped_region, size_t length, void *mapped_cookie);
extern int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp);
extern int __wt_block_manager_drop(WT_SESSION_IMPL *session, const char *filename);
extern int __wt_block_manager_create( WT_SESSION_IMPL *session, const char *filename, uint32_t allocsize);
@@ -118,9 +118,9 @@ extern int __wt_debug_offset_blind( WT_SESSION_IMPL *session, wt_off_t offset, c
extern int __wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size, uint32_t cksum, const char *ofile);
extern int __wt_debug_disk( WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile);
extern int __wt_debug_tree_shape( WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
-extern int __wt_debug_tree_all( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile);
-extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile);
-extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile);
+extern int __wt_debug_tree_all( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile);
+extern int __wt_debug_tree( WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile);
+extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile);
extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp);
extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref);
extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all);
@@ -134,7 +134,7 @@ extern int __wt_btree_open(WT_SESSION_IMPL *session, const char *op_cfg[]);
extern int __wt_btree_close(WT_SESSION_IMPL *session);
extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno);
extern int __wt_btree_tree_open( WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size);
-extern int __wt_btree_new_leaf_page( WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep);
+extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep);
extern void __wt_btree_evictable(WT_SESSION_IMPL *session, bool on);
extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session);
extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session);
@@ -148,7 +148,7 @@ extern const char *__wt_buf_set_printable( WT_SESSION_IMPL *session, const void
extern int __wt_ovfl_read(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store);
extern int __wt_ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack);
extern int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell);
-extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep);
+extern int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep);
extern int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, size_t memsize, uint32_t flags, WT_PAGE **pagep);
extern int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size);
extern int
@@ -203,6 +203,8 @@ extern int __wt_las_cursor_open(WT_SESSION_IMPL *session, WT_CURSOR **cursorp);
extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags);
extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags);
extern int __wt_las_sweep(WT_SESSION_IMPL *session);
+extern uint32_t __wt_cksum(const void *chunk, size_t len);
+extern void __wt_cksum_init(void);
extern int __wt_config_initn( WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str, size_t len);
extern int __wt_config_init(WT_SESSION_IMPL *session, WT_CONFIG *conf, const char *str);
extern int __wt_config_subinit( WT_SESSION_IMPL *session, WT_CONFIG *conf, WT_CONFIG_ITEM *item);
@@ -283,8 +285,8 @@ extern int __wt_curdump_create(WT_CURSOR *child, WT_CURSOR *owner, WT_CURSOR **c
extern int __wt_curfile_update_check(WT_CURSOR *cursor);
extern int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap, WT_CURSOR **cursorp);
extern int __wt_curfile_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
-extern int __wt_curindex_joined(WT_CURSOR *cursor);
extern int __wt_curindex_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
+extern int __wt_curjoin_joined(WT_CURSOR *cursor);
extern int __wt_curjoin_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
extern int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range, uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count);
extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, WT_CURSOR_JSON *json, bool iskey, va_list ap);
@@ -356,7 +358,6 @@ extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp);
extern void __wt_log_written_reset(WT_SESSION_IMPL *session);
extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only);
-extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count);
extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id);
extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot);
extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest);
@@ -580,8 +581,6 @@ extern void __wt_session_close_cache(WT_SESSION_IMPL *session);
extern int __wt_session_get_btree(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, const char *cfg[], uint32_t flags);
extern int __wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint);
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]);
-extern uint32_t __wt_cksum(const void *chunk, size_t len);
-extern void __wt_cksum_init(void);
extern int __wt_cond_auto_alloc( WT_SESSION_IMPL *session, const char *name, bool is_signalled, uint64_t min, uint64_t max, WT_CONDVAR **condp);
extern int __wt_cond_auto_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond);
extern int __wt_cond_auto_wait_signal( WT_SESSION_IMPL *session, WT_CONDVAR *cond, bool progress, bool *signalled);
@@ -676,8 +675,8 @@ extern void __wt_stat_join_clear_single(WT_JOIN_STATS *stats);
extern void __wt_stat_join_clear_all(WT_JOIN_STATS **stats);
extern void __wt_stat_join_aggregate( WT_JOIN_STATS **from, WT_JOIN_STATS *to);
extern void __wt_txn_release_snapshot(WT_SESSION_IMPL *session);
-extern void __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
-extern void __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force);
+extern int __wt_txn_get_snapshot(WT_SESSION_IMPL *session);
+extern int __wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force);
extern int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]);
extern void __wt_txn_release(WT_SESSION_IMPL *session);
extern int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]);
@@ -713,7 +712,7 @@ extern int __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session, const char *
extern int __wt_txn_named_snapshot_destroy(WT_SESSION_IMPL *session);
extern int __wt_txn_recover(WT_SESSION_IMPL *session);
extern bool __wt_absolute_path(const char *path);
-extern bool __wt_handle_search(WT_SESSION_IMPL *session, const char *name, bool increment_ref, WT_FH *newfh, WT_FH **fhp);
+extern bool __wt_handle_is_open(WT_SESSION_IMPL *session, const char *name);
extern bool __wt_has_priv(void);
extern const char *__wt_path_separator(void);
extern const char *__wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen);
@@ -731,49 +730,43 @@ extern int __wt_dlsym(WT_SESSION_IMPL *session, WT_DLH *dlh, const char *name, b
extern int __wt_epoch(WT_SESSION_IMPL *session, struct timespec *tsp);
extern int __wt_errno(void);
extern int __wt_filename(WT_SESSION_IMPL *session, const char *name, char **path);
+extern int __wt_fopen(WT_SESSION_IMPL *session, const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fsp);
extern int __wt_get_vm_pagesize(void);
extern int __wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp);
extern int __wt_getlasterror(void);
-extern int __wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_FH *fh);
extern int __wt_getopt( const char *progname, int nargc, char *const *nargv, const char *ostr);
extern int __wt_malloc(WT_SESSION_IMPL *session, size_t bytes_to_allocate, void *retp);
extern int __wt_map_error_rdonly(int error);
extern int __wt_nfilename( WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path);
extern int __wt_once(void (*init_routine)(void));
-extern int __wt_open(WT_SESSION_IMPL *session, const char *name, uint32_t file_type, uint32_t flags, WT_FH **fhp);
-extern int __wt_os_cleanup(WT_SESSION_IMPL *session);
-extern int __wt_os_init(WT_SESSION_IMPL *session);
+extern int __wt_open(WT_SESSION_IMPL *session, const char *name, WT_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp);
extern int __wt_os_inmemory(WT_SESSION_IMPL *session);
-extern int __wt_os_inmemory_cleanup(WT_SESSION_IMPL *session);
extern int __wt_os_posix(WT_SESSION_IMPL *session);
-extern int __wt_os_posix_cleanup(WT_SESSION_IMPL *session);
extern int __wt_os_stdio(WT_SESSION_IMPL *session);
extern int __wt_os_win(WT_SESSION_IMPL *session);
-extern int __wt_os_win_cleanup(WT_SESSION_IMPL *session);
-extern int __wt_posix_directory_list(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
-extern int __wt_posix_handle_allocate( WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len);
-extern int __wt_posix_map(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie);
-extern int __wt_posix_map_discard( WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t size);
-extern int __wt_posix_map_preload( WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t size);
-extern int __wt_posix_map_unmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
+extern int __wt_posix_directory_list(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *directory, const char *prefix, char ***dirlistp, uint32_t *countp);
+extern int __wt_posix_directory_list_free(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, char **dirlist, uint32_t count);
+extern int __wt_posix_file_fallocate(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset, wt_off_t len);
+extern int __wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep);
+extern int __wt_posix_map_discard(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie);
+extern int __wt_posix_map_preload(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie);
+extern int __wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session, void *mapped_region, size_t len, void *mapped_cookie);
extern int __wt_realloc(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
extern int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
extern int __wt_realloc_noclear(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp);
extern int __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name);
extern int __wt_rename_and_sync_directory( WT_SESSION_IMPL *session, const char *from, const char *to);
extern int __wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp);
-extern int __wt_sync_handle_and_rename( WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to);
extern int __wt_thread_create(WT_SESSION_IMPL *session, wt_thread_t *tidret, WT_THREAD_CALLBACK(*func)(void *), void *arg);
extern int __wt_thread_join(WT_SESSION_IMPL *session, wt_thread_t tid);
-extern int __wt_win_directory_list(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp);
-extern int __wt_win_map(WT_SESSION_IMPL *session, WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie);
-extern int __wt_win_map_discard(WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t size);
-extern int __wt_win_map_preload( WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t size);
-extern int __wt_win_map_unmap(WT_SESSION_IMPL *session, WT_FH *fh, void *map, size_t len, void **mappingcookie);
+extern int __wt_win_directory_list(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *directory, const char *prefix, char ***dirlistp, uint32_t *countp);
+extern int __wt_win_directory_list_free(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, char **dirlist, uint32_t count);
+extern int __wt_win_fs_size(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name, wt_off_t *sizep);
+extern int __wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_regionp, size_t *lenp, void *mapped_cookiep);
+extern int __wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, void *mapped_region, size_t length, void *mapped_cookie);
extern uint64_t __wt_strtouq(const char *nptr, char **endptr, int base);
extern void __wt_abort(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn));
extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg);
-extern void __wt_posix_handle_allocate_configure(WT_SESSION_IMPL *session, WT_FH *fh);
extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds);
extern void __wt_stream_set_line_buffer(FILE *fp);
extern void __wt_stream_set_no_buffer(FILE *fp);
diff --git a/src/include/flags.h b/src/include/flags.h
index 3d9b0ed716b..da7aee7b059 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -24,11 +24,6 @@
#define WT_EVICT_IN_MEMORY 0x00000002
#define WT_EVICT_LOOKASIDE 0x00000004
#define WT_EVICT_UPDATE_RESTORE 0x00000008
-#define WT_FILE_TYPE_CHECKPOINT 0x00000001
-#define WT_FILE_TYPE_DATA 0x00000002
-#define WT_FILE_TYPE_DIRECTORY 0x00000004
-#define WT_FILE_TYPE_LOG 0x00000008
-#define WT_FILE_TYPE_REGULAR 0x00000010
#define WT_LOGSCAN_FIRST 0x00000001
#define WT_LOGSCAN_FROM_CKP 0x00000002
#define WT_LOGSCAN_ONE 0x00000004
diff --git a/src/include/log.h b/src/include/log.h
index 0e676d47b66..387d0c6c154 100644
--- a/src/include/log.h
+++ b/src/include/log.h
@@ -46,10 +46,12 @@ union __wt_lsn {
*/
#define WT_IS_INIT_LSN(l) ((l)->file_offset == ((uint64_t)1 << 32))
/*
- * XXX Original tested INT32_MAX.
+ * Original tested INT32_MAX. But if we read one from an older
+ * release we may see UINT32_MAX.
*/
#define WT_IS_MAX_LSN(lsn) \
- ((lsn)->l.file == UINT32_MAX && (lsn)->l.offset == INT32_MAX)
+ ((lsn)->l.file == UINT32_MAX && \
+ ((lsn)->l.offset == INT32_MAX || (lsn)->l.offset == UINT32_MAX))
/*
* Both of the macros below need to change if the content of __wt_lsn
diff --git a/src/include/meta.h b/src/include/meta.h
index ac0f5fedac4..ba4149979ef 100644
--- a/src/include/meta.h
+++ b/src/include/meta.h
@@ -14,8 +14,10 @@
#define WT_USERCONFIG "WiredTiger.config" /* User configuration */
+#define WT_BACKUP_TMP "WiredTiger.backup.tmp" /* Backup tmp file */
#define WT_METADATA_BACKUP "WiredTiger.backup" /* Hot backup file */
#define WT_INCREMENTAL_BACKUP "WiredTiger.ibackup" /* Incremental backup */
+#define WT_INCREMENTAL_SRC "WiredTiger.isrc" /* Incremental source */
#define WT_METADATA_TURTLE "WiredTiger.turtle" /* Metadata metadata */
#define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */
diff --git a/src/include/misc.h b/src/include/misc.h
index 07d52c61eac..4c7c9572905 100644
--- a/src/include/misc.h
+++ b/src/include/misc.h
@@ -96,8 +96,9 @@
* the caller remember to put the & operator on the pointer.
*/
#define __wt_free(session, p) do { \
- if ((p) != NULL) \
- __wt_free_int(session, (void *)&(p)); \
+ void *__p = &(p); \
+ if (*(void **)__p != NULL) \
+ __wt_free_int(session, __p); \
} while (0)
#ifdef HAVE_DIAGNOSTIC
#define __wt_overwrite_and_free(session, p) do { \
diff --git a/src/include/misc.i b/src/include/misc.i
index 114b711ac88..eaa7a328ff1 100644
--- a/src/include/misc.i
+++ b/src/include/misc.i
@@ -70,248 +70,3 @@ __wt_verbose(WT_SESSION_IMPL *session, int flag, const char *fmt, ...)
return (0);
#endif
}
-
-/*
- * __wt_dirlist --
- * Get a list of files from a directory.
- */
-static inline int
-__wt_dirlist(WT_SESSION_IMPL *session, const char *dir,
- const char *prefix, uint32_t flags, char ***dirlist, u_int *countp)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
-
- WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
- "%s: directory-list: %s prefix %s",
- dir, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
- prefix == NULL ? "all" : prefix));
-
- return (S2C(session)->file_directory_list(
- session, dir, prefix, flags, dirlist, countp));
-}
-
-/*
- * __wt_directory_sync --
- * Flush a directory to ensure file creation is durable.
- */
-static inline int
-__wt_directory_sync(WT_SESSION_IMPL *session, const char *name)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- WT_RET(__wt_verbose(
- session, WT_VERB_FILEOPS, "%s: directory-sync", name));
-
- return (S2C(session)->file_directory_sync(session, name));
-}
-
-/*
- * __wt_exist --
- * Return if the file exists.
- */
-static inline int
-__wt_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
-{
- WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-exist", name));
-
- return (S2C(session)->file_exist(session, name, existp));
-}
-
-/*
- * __wt_remove --
- * POSIX remove.
- */
-static inline int
-__wt_remove(WT_SESSION_IMPL *session, const char *name)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-remove", name));
-
- return (S2C(session)->file_remove(session, name));
-}
-
-/*
- * __wt_rename --
- * POSIX rename.
- */
-static inline int
-__wt_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- WT_RET(__wt_verbose(
- session, WT_VERB_FILEOPS, "%s to %s: file-rename", from, to));
-
- return (S2C(session)->file_rename(session, from, to));
-}
-
-/*
- * __wt_filesize_name --
- * Get the size of a file in bytes, by file name.
- */
-static inline int
-__wt_filesize_name(
- WT_SESSION_IMPL *session, const char *name, bool silent, wt_off_t *sizep)
-{
- WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-size", name));
-
- return (S2C(session)->file_size(session, name, silent, sizep));
-}
-
-/*
- * __wt_directory_sync_fh --
- * Flush a directory file handle to ensure file creation is durable.
- *
- * We don't use the normal sync path because many file systems don't require
- * this step and we don't want to penalize them.
- */
-static inline int
-__wt_directory_sync_fh(WT_SESSION_IMPL *session, WT_FH *fh)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- return (fh->fh_sync(session, fh, true));
-}
-
-/*
- * __wt_fallocate --
- * Extend a file.
- */
-static inline int
-__wt_fallocate(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
-
- WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: handle-allocate: %" PRIuMAX " at %" PRIuMAX,
- fh->name, (uintmax_t)len, (uintmax_t)offset));
-
- return (fh->fh_allocate(session, fh, offset, len));
-}
-
-/*
- * __wt_file_lock --
- * Lock/unlock a file.
- */
-static inline int
-__wt_file_lock(WT_SESSION_IMPL * session, WT_FH *fh, bool lock)
-{
- WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: handle-lock: %s", fh->name, lock ? "lock" : "unlock"));
-
- return (fh->fh_lock(session, fh, lock));
-}
-
-/*
- * __wt_vfprintf --
- * ANSI C vfprintf.
- */
-static inline int
-__wt_vfprintf(WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, va_list ap)
-{
- WT_RET(__wt_verbose(
- session, WT_VERB_HANDLEOPS, "%s: handle-printf", fh->name));
-
- return (fh->fh_printf(session, fh, fmt, ap));
-}
-
-/*
- * __wt_fprintf --
- * ANSI C fprintf.
- */
-static inline int
-__wt_fprintf(WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, ...)
- WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
-{
- WT_DECL_RET;
- va_list ap;
-
- va_start(ap, fmt);
- ret = __wt_vfprintf(session, fh, fmt, ap);
- va_end(ap);
-
- return (ret);
-}
-
-/*
- * __wt_read --
- * POSIX pread.
- */
-static inline int
-__wt_read(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
-{
- WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: handle-read: %" WT_SIZET_FMT " at %" PRIuMAX,
- fh->name, len, (uintmax_t)offset));
-
- WT_STAT_FAST_CONN_INCR(session, read_io);
-
- return (fh->fh_read(session, fh, offset, len, buf));
-}
-
-/*
- * __wt_filesize --
- * Get the size of a file in bytes, by file handle.
- */
-static inline int
-__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
-{
- WT_RET(__wt_verbose(
- session, WT_VERB_HANDLEOPS, "%s: handle-size", fh->name));
-
- return (fh->fh_size(session, fh, sizep));
-}
-
-/*
- * __wt_fsync --
- * POSIX fflush/fsync.
- */
-static inline int
-__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
-{
- WT_RET(__wt_verbose(
- session, WT_VERB_HANDLEOPS, "%s: handle-sync", fh->name));
-
- return (fh->fh_sync(session, fh, block));
-}
-
-/*
- * __wt_ftruncate --
- * POSIX ftruncate.
- */
-static inline int
-__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
- WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: handle-truncate: %" PRIuMAX,
- fh->name, (uintmax_t)len));
-
- return (fh->fh_truncate(session, fh, len));
-}
-
-/*
- * __wt_write --
- * POSIX pwrite.
- */
-static inline int
-__wt_write(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) ||
- WT_STRING_MATCH(fh->name,
- WT_SINGLETHREAD, strlen(WT_SINGLETHREAD)));
-
- WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: handle-write: %" WT_SIZET_FMT " at %" PRIuMAX,
- fh->name, len, (uintmax_t)offset));
-
- WT_STAT_FAST_CONN_INCR(session, write_io);
-
- return (fh->fh_write(session, fh, offset, len, buf));
-}
diff --git a/src/include/os.h b/src/include/os.h
index 2ff41d39f46..bf60f32f764 100644
--- a/src/include/os.h
+++ b/src/include/os.h
@@ -6,14 +6,6 @@
* See the file LICENSE for redistribution information.
*/
-/*
- * Number of directory entries can grow dynamically.
- */
-#define WT_DIR_ENTRY 32
-
-#define WT_DIRLIST_EXCLUDE 0x1 /* Exclude files matching prefix */
-#define WT_DIRLIST_INCLUDE 0x2 /* Include files matching prefix */
-
#define WT_SYSCALL_RETRY(call, ret) do { \
int __retry; \
for (__retry = 0; __retry < 10; ++__retry) { \
@@ -59,81 +51,97 @@
(t1).tv_nsec == (t2).tv_nsec ? 0 : 1 : 1)
/*
- * The underlying OS calls return ENOTSUP if posix_fadvise functionality isn't
- * available, but WiredTiger uses the POSIX flag names in the API. Use distinct
- * values so the underlying code can distinguish.
+ * Macros to ensure a file handle is inserted or removed from both the main and
+ * the hashed queue, used by connection-level and in-memory data structures.
*/
-#ifndef POSIX_FADV_DONTNEED
-#define POSIX_FADV_DONTNEED 0x01
-#endif
-#ifndef POSIX_FADV_WILLNEED
-#define POSIX_FADV_WILLNEED 0x02
-#endif
+#define WT_FILE_HANDLE_INSERT(h, fh, bucket) do { \
+ TAILQ_INSERT_HEAD(&(h)->fhqh, fh, q); \
+ TAILQ_INSERT_HEAD(&(h)->fhhash[bucket], fh, hashq); \
+} while (0)
-#define WT_OPEN_CREATE 0x001 /* Create is OK */
-#define WT_OPEN_EXCLUSIVE 0x002 /* Exclusive open */
-#define WT_OPEN_FIXED 0x004 /* Path isn't relative to home */
-#define WT_OPEN_READONLY 0x008 /* Readonly open */
-#define WT_STREAM_APPEND 0x010 /* Open a stream: append */
-#define WT_STREAM_LINE_BUFFER 0x020 /* Line buffer the stream */
-#define WT_STREAM_READ 0x040 /* Open a stream: read */
-#define WT_STREAM_WRITE 0x080 /* Open a stream: write */
+#define WT_FILE_HANDLE_REMOVE(h, fh, bucket) do { \
+ TAILQ_REMOVE(&(h)->fhqh, fh, q); \
+ TAILQ_REMOVE(&(h)->fhhash[bucket], fh, hashq); \
+} while (0)
struct __wt_fh {
+ /*
+ * There is a file name field in both the WT_FH and WT_FILE_HANDLE
+ * structures, which isn't ideal. There would be compromises to keeping
+ * a single copy: If it were in WT_FH, file systems could not access
+ * the name field, if it were just in the WT_FILE_HANDLE internal
+ * WiredTiger code would need to maintain a string inside a structure
+ * that is owned by the user (since we care about the content of the
+ * file name). Keeping two copies seems most reasonable.
+ */
const char *name; /* File name */
- uint64_t name_hash; /* Hash of name */
- TAILQ_ENTRY(__wt_fh) q; /* List of open handles */
- TAILQ_ENTRY(__wt_fh) hashq; /* Hashed list of handles */
- u_int ref; /* Reference count */
+ uint64_t name_hash; /* hash of name */
+ TAILQ_ENTRY(__wt_fh) q; /* internal queue */
+ TAILQ_ENTRY(__wt_fh) hashq; /* internal hash queue */
+ u_int ref; /* reference count */
+
+ WT_FILE_HANDLE *handle;
+};
+
+#ifdef _WIN32
+struct __wt_file_handle_win {
+ WT_FILE_HANDLE iface;
/*
- * Underlying file system handle support.
+ * Windows specific file handle fields
*/
-#ifdef _WIN32
HANDLE filehandle; /* Windows file handle */
HANDLE filehandle_secondary; /* Windows file handle
for file size changes */
+ bool direct_io; /* O_DIRECT configured */
+};
+
#else
+
+struct __wt_file_handle_posix {
+ WT_FILE_HANDLE iface;
+
+ /*
+ * POSIX specific file handle fields
+ */
int fd; /* POSIX file handle */
+
+ bool direct_io; /* O_DIRECT configured */
+};
#endif
- FILE *fp; /* ANSI C stdio handle */
+
+struct __wt_file_handle_inmem {
+ WT_FILE_HANDLE iface;
/*
- * Underlying in-memory handle support.
+ * In memory specific file handle fields
*/
+ uint64_t name_hash; /* hash of name */
+ TAILQ_ENTRY(__wt_file_handle_inmem) q; /* internal queue, hash queue */
+ TAILQ_ENTRY(__wt_file_handle_inmem) hashq;
+
size_t off; /* Read/write offset */
WT_ITEM buf; /* Data */
+ u_int ref; /* Reference count */
+};
- bool direct_io; /* O_DIRECT configured */
+struct __wt_fstream {
+ const char *name; /* Stream name */
- enum { /* file extend configuration */
- WT_FALLOCATE_AVAILABLE,
- WT_FALLOCATE_NOT_AVAILABLE,
- WT_FALLOCATE_POSIX,
- WT_FALLOCATE_STD,
- WT_FALLOCATE_SYS } fallocate_available;
- bool fallocate_requires_locking;
+ FILE *fp; /* stdio FILE stream */
+ WT_FH *fh; /* WT file handle */
+ wt_off_t off; /* Read/write offset */
+ wt_off_t size; /* File size */
+ WT_ITEM buf; /* Data */
-#define WT_FH_FLUSH_ON_CLOSE 0x01 /* Flush when closing */
-#define WT_FH_IN_MEMORY 0x02 /* In-memory, don't remove */
+#define WT_STREAM_APPEND 0x01 /* Open a stream for append */
+#define WT_STREAM_READ 0x02 /* Open a stream for read */
+#define WT_STREAM_WRITE 0x04 /* Open a stream for write */
uint32_t flags;
- int (*fh_advise)(WT_SESSION_IMPL *, WT_FH *, wt_off_t, wt_off_t, int);
- int (*fh_allocate)(WT_SESSION_IMPL *, WT_FH *, wt_off_t, wt_off_t);
- int (*fh_close)(WT_SESSION_IMPL *, WT_FH *);
- int (*fh_getc)(WT_SESSION_IMPL *, WT_FH *, int *);
- int (*fh_lock)(WT_SESSION_IMPL *, WT_FH *, bool);
- int (*fh_map)(WT_SESSION_IMPL *, WT_FH *, void *, size_t *, void **);
- int (*fh_map_discard)(WT_SESSION_IMPL *, WT_FH *, void *, size_t);
- int (*fh_map_preload)(WT_SESSION_IMPL *, WT_FH *, const void *, size_t);
- int (*fh_map_unmap)(
- WT_SESSION_IMPL *, WT_FH *, void *, size_t, void **);
- int (*fh_printf)(WT_SESSION_IMPL *, WT_FH *, const char *, va_list);
- int (*fh_read)(WT_SESSION_IMPL *, WT_FH *, wt_off_t, size_t, void *);
- int (*fh_size)(WT_SESSION_IMPL *, WT_FH *, wt_off_t *);
- int (*fh_sync)(WT_SESSION_IMPL *, WT_FH *, bool);
- int (*fh_truncate)(WT_SESSION_IMPL *, WT_FH *, wt_off_t);
- int (*fh_write)(
- WT_SESSION_IMPL *, WT_FH *, wt_off_t, size_t, const void *);
+ int (*close)(WT_SESSION_IMPL *, WT_FSTREAM *);
+ int (*flush)(WT_SESSION_IMPL *, WT_FSTREAM *);
+ int (*getline)(WT_SESSION_IMPL *, WT_FSTREAM *, WT_ITEM *);
+ int (*printf)(WT_SESSION_IMPL *, WT_FSTREAM *, const char *, va_list);
};
diff --git a/src/include/os_fhandle.i b/src/include/os_fhandle.i
new file mode 100644
index 00000000000..8d2cda4b305
--- /dev/null
+++ b/src/include/os_fhandle.i
@@ -0,0 +1,154 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_fsync --
+ * Sync a file handle (POSIX fsync).
+ *
+ * When block is true the handle's sync method is used, otherwise its
+ * sync_nowait method is used. Both methods are optional: if the selected
+ * one is NULL nothing is done and 0 is returned, otherwise the method's
+ * return value is returned. Asserts the connection is not read-only.
+ */
+static inline int
+__wt_fsync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
+{
+ WT_FILE_HANDLE *handle;
+
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_HANDLEOPS, "%s: handle-sync", fh->handle->name));
+
+ handle = fh->handle;
+ if (block)
+ return (handle->sync == NULL ? 0 :
+ handle->sync(handle, (WT_SESSION *)session));
+ else
+ return (handle->sync_nowait == NULL ? 0 :
+ handle->sync_nowait(handle, (WT_SESSION *)session));
+}
+
+/*
+ * __wt_fallocate --
+ * Extend a file.
+ *
+ * Passes offset and len to the handle's fallocate_nolock method if one is
+ * configured, returning 0 as soon as it succeeds. If that method is not
+ * configured, or fails with ENOTSUP, the handle's fallocate method is
+ * tried instead; ENOTSUP is returned when no usable method remains.
+ * Asserts the connection is neither read-only nor in-memory.
+ */
+static inline int
+__wt_fallocate(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+{
+ WT_DECL_RET;
+ WT_FILE_HANDLE *handle;
+
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
+
+ WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: handle-allocate: %" PRIuMAX " at %" PRIuMAX,
+ fh->handle->name, (uintmax_t)len, (uintmax_t)offset));
+
+ /*
+ * Our caller is responsible for handling any locking issues, all we
+ * have to do is find a function to call.
+ *
+ * Be cautious, the underlying system might have configured the nolock
+ * flavor, that failed, and we have to fall back to the locking flavor.
+ * NOTE(review): WT_RET_ERROR_OK presumably returns any nolock error
+ * other than ENOTSUP to the caller -- confirm against its definition.
+ */
+ handle = fh->handle;
+ if (handle->fallocate_nolock != NULL) {
+ if ((ret = handle->fallocate_nolock(
+ handle, (WT_SESSION *)session, offset, len)) == 0)
+ return (0);
+ WT_RET_ERROR_OK(ret, ENOTSUP);
+ }
+ if (handle->fallocate != NULL)
+ return (handle->fallocate(
+ handle, (WT_SESSION *)session, offset, len));
+ return (ENOTSUP);
+}
+
+/*
+ * __wt_file_lock --
+ * Lock/unlock a file.
+ *
+ * The lock argument selects the operation: true is logged as "lock",
+ * false as "unlock", and the flag is handed unchanged to the handle's
+ * lock method. The method is optional: when it is NULL nothing is done
+ * and 0 is returned.
+ */
+static inline int
+__wt_file_lock(WT_SESSION_IMPL * session, WT_FH *fh, bool lock)
+{
+ WT_FILE_HANDLE *handle;
+
+ WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: handle-lock: %s", fh->handle->name, lock ? "lock" : "unlock"));
+
+ handle = fh->handle;
+ return (handle->lock == NULL ? 0 :
+ handle->lock(handle, (WT_SESSION*)session, lock));
+}
+
+/*
+ * __wt_read --
+ * POSIX pread.
+ *
+ * Logs the request, increments the connection's read_io statistic and
+ * then calls the handle's read method with offset, len and buf,
+ * returning that method's result. Unlike sync or lock, the read method
+ * is called without a NULL check.
+ */
+static inline int
+__wt_read(
+ WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+{
+ WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: handle-read: %" WT_SIZET_FMT " at %" PRIuMAX,
+ fh->handle->name, len, (uintmax_t)offset));
+
+ WT_STAT_FAST_CONN_INCR(session, read_io);
+
+ return (fh->handle->read(
+ fh->handle, (WT_SESSION *)session, offset, len, buf));
+}
+
+/*
+ * __wt_filesize --
+ * Get the size of a file in bytes, by file handle.
+ *
+ * Logs the request and forwards sizep to the handle's size method,
+ * returning that method's result. The size method is called without a
+ * NULL check.
+ */
+static inline int
+__wt_filesize(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_HANDLEOPS, "%s: handle-size", fh->handle->name));
+
+ return (fh->handle->size(fh->handle, (WT_SESSION *)session, sizep));
+}
+
+/*
+ * __wt_ftruncate --
+ * POSIX ftruncate.
+ *
+ * Logs the request and calls the handle's truncate method with len,
+ * returning that method's result. Asserts the connection is not
+ * read-only. The truncate method is called without a NULL check.
+ */
+static inline int
+__wt_ftruncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+{
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+
+ WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: handle-truncate: %" PRIuMAX,
+ fh->handle->name, (uintmax_t)len));
+
+ return (fh->handle->truncate(fh->handle, (WT_SESSION *)session, len));
+}
+
+/*
+ * __wt_write --
+ * POSIX pwrite.
+ *
+ * Logs the request, increments the connection's write_io statistic and
+ * then calls the handle's write method with offset, len and buf,
+ * returning that method's result. Writes on a read-only connection trip
+ * the assertion unless the WT_FH name matches WT_SINGLETHREAD. Note the
+ * assertion reads the WT_FH copy of the name while the log message reads
+ * the WT_FILE_HANDLE copy.
+ */
+static inline int
+__wt_write(WT_SESSION_IMPL *session,
+ WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+{
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY) ||
+ WT_STRING_MATCH(fh->name,
+ WT_SINGLETHREAD, strlen(WT_SINGLETHREAD)));
+
+ WT_RET(__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: handle-write: %" WT_SIZET_FMT " at %" PRIuMAX,
+ fh->handle->name, len, (uintmax_t)offset));
+
+ WT_STAT_FAST_CONN_INCR(session, write_io);
+
+ return (fh->handle->write(
+ fh->handle, (WT_SESSION *)session, offset, len, buf));
+}
diff --git a/src/include/os_fs.i b/src/include/os_fs.i
new file mode 100644
index 00000000000..151898711d8
--- /dev/null
+++ b/src/include/os_fs.i
@@ -0,0 +1,243 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_fs_directory_list --
+ * Get a list of files from a directory.
+ *
+ * The directory name is expanded by __wt_filename and the resulting
+ * path, together with prefix, is passed to the file system's
+ * directory_list method; a NULL prefix is logged as "all". *dirlistp and
+ * *countp are reset to NULL and 0 before anything can fail, so they are
+ * well defined on every return path. The expanded path is freed before
+ * returning the method's result. Release the returned list with
+ * __wt_fs_directory_list_free.
+ */
+static inline int
+__wt_fs_directory_list(WT_SESSION_IMPL *session,
+ const char *dir, const char *prefix, char ***dirlistp, u_int *countp)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *path;
+
+ *dirlistp = NULL;
+ *countp = 0;
+
+ /*
+ * The format must consume exactly the two strings supplied (the
+ * directory and the prefix): an extra conversion would read a
+ * non-existent argument, which is undefined behavior.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: directory-list: prefix %s",
+ dir, prefix == NULL ? "all" : prefix));
+
+ WT_RET(__wt_filename(session, dir, &path));
+
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->directory_list(
+ file_system, wt_session, path, prefix, dirlistp, countp);
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_fs_directory_list_free --
+ * Free memory allocated by __wt_fs_directory_list.
+ *
+ * When *dirlistp is non-NULL it is handed, with count, to the file
+ * system's directory_list_free method and that method's result is
+ * returned. When *dirlistp is NULL the method is not called and the
+ * initial value of ret is returned (presumably 0 from WT_DECL_RET --
+ * confirm against its definition). *dirlistp is set to NULL in both
+ * cases, so calling this twice on the same list is harmless.
+ */
+static inline int
+__wt_fs_directory_list_free(
+ WT_SESSION_IMPL *session, char ***dirlistp, u_int count)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+
+ if (*dirlistp != NULL) {
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->directory_list_free(
+ file_system, wt_session, *dirlistp, count);
+ }
+
+ *dirlistp = NULL;
+ return (ret);
+}
+
+/*
+ * __wt_fs_directory_sync --
+ * Flush a directory to ensure file creation is durable.
+ *
+ * name is a file name, not a directory name. A NULL name, or a name
+ * without a '/' separator, selects the connection's home directory.
+ * Otherwise the name is expanded by __wt_filename and truncated after
+ * its last '/' to get the containing directory. The chosen directory is
+ * passed to the file system's directory_sync method and any expanded
+ * copy is freed before returning the method's result. The method is
+ * optional: when it is NULL the call returns 0 without doing anything.
+ * Asserts the connection is not read-only.
+ */
+static inline int
+__wt_fs_directory_sync(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *copy, *dir;
+
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+
+ /*
+ * A NULL name is legal here (it is handled below), so never hand it
+ * to a %s conversion, that is undefined behavior.
+ */
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
+ "%s: directory-sync", name == NULL ? "(home)" : name));
+
+ /*
+ * POSIX 1003.1 does not require that fsync of a file handle ensures the
+ * entry in the directory containing the file has also reached disk (and
+ * there are historic Linux filesystems requiring it). If the underlying
+ * filesystem method is set, do an explicit fsync on a file descriptor
+ * for the directory to be sure.
+ *
+ * directory-sync is not a required call, no method means the call isn't
+ * needed.
+ */
+ file_system = S2C(session)->file_system;
+ if (file_system->directory_sync == NULL)
+ return (0);
+
+ copy = NULL;
+ if (name == NULL || strchr(name, '/') == NULL)
+ name = S2C(session)->home;
+ else {
+ /*
+ * File name construction should not return a path without any
+ * slash separator, but caution isn't unreasonable.
+ */
+ WT_RET(__wt_filename(session, name, &copy));
+ if ((dir = strrchr(copy, '/')) == NULL)
+ name = S2C(session)->home;
+ else {
+ /* Keep the trailing slash, drop the file component. */
+ dir[1] = '\0';
+ name = copy;
+ }
+ }
+
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->directory_sync(file_system, wt_session, name);
+
+ __wt_free(session, copy);
+ return (ret);
+}
+
+/*
+ * __wt_fs_exist --
+ * Return if the file exists.
+ *
+ * name is expanded to a path by __wt_filename, then the path and existp
+ * are passed to the file system's exist method. The expanded path is
+ * freed before the method's result is returned.
+ */
+static inline int
+__wt_fs_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *path;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-exist", name));
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->exist(file_system, wt_session, path, existp);
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_fs_remove --
+ * POSIX remove.
+ *
+ * name is expanded to a path by __wt_filename and passed to the file
+ * system's remove method; the expanded path is freed before the method's
+ * result is returned. Asserts the connection is not read-only. In
+ * HAVE_DIAGNOSTIC builds the call fails with EINVAL, before touching the
+ * file system, if __wt_handle_is_open reports the name as open.
+ */
+static inline int
+__wt_fs_remove(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *path;
+
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-remove", name));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * It is a layering violation to retrieve a WT_FH here, but it is a
+ * useful diagnostic to ensure WiredTiger doesn't have the handle open.
+ */
+ if (__wt_handle_is_open(session, name))
+ WT_RET_MSG(session, EINVAL,
+ "%s: file-remove: file has open handles", name);
+#endif
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->remove(file_system, wt_session, path);
+
+ __wt_free(session, path);
+ return (ret);
+}
+
+/*
+ * __wt_fs_rename --
+ * POSIX rename.
+ *
+ * Both names are expanded to paths by __wt_filename and passed to the
+ * file system's rename method. The two path pointers start out NULL so
+ * the shared cleanup at the err label can free them whichever expansion
+ * failed. Asserts the connection is not read-only. In HAVE_DIAGNOSTIC
+ * builds the call fails with EINVAL, before touching the file system, if
+ * __wt_handle_is_open reports either name as open.
+ */
+static inline int
+__wt_fs_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *from_path, *to_path;
+
+ WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+
+ WT_RET(__wt_verbose(
+ session, WT_VERB_FILEOPS, "%s to %s: file-rename", from, to));
+
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * It is a layering violation to retrieve a WT_FH here, but it is a
+ * useful diagnostic to ensure WiredTiger doesn't have the handle open.
+ */
+ if (__wt_handle_is_open(session, from))
+ WT_RET_MSG(session, EINVAL,
+ "%s: file-rename: file has open handles", from);
+ if (__wt_handle_is_open(session, to))
+ WT_RET_MSG(session, EINVAL,
+ "%s: file-rename: file has open handles", to);
+#endif
+
+ from_path = to_path = NULL;
+ WT_ERR(__wt_filename(session, from, &from_path));
+ WT_ERR(__wt_filename(session, to, &to_path));
+
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->rename(file_system, wt_session, from_path, to_path);
+
+err: __wt_free(session, from_path);
+ __wt_free(session, to_path);
+ return (ret);
+}
+
+/*
+ * __wt_fs_size --
+ * Get the size of a file in bytes, by file name.
+ *
+ * name is expanded to a path by __wt_filename, then the path and sizep
+ * are passed to the file system's size method. The expanded path is
+ * freed before the method's result is returned.
+ */
+static inline int
+__wt_fs_size(WT_SESSION_IMPL *session, const char *name, wt_off_t *sizep)
+{
+ WT_DECL_RET;
+ WT_FILE_SYSTEM *file_system;
+ WT_SESSION *wt_session;
+ char *path;
+
+ WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: file-size", name));
+
+ WT_RET(__wt_filename(session, name, &path));
+
+ file_system = S2C(session)->file_system;
+ wt_session = (WT_SESSION *)session;
+ ret = file_system->size(file_system, wt_session, path, sizep);
+
+ __wt_free(session, path);
+ return (ret);
+}
diff --git a/src/include/os_fstream.i b/src/include/os_fstream.i
new file mode 100644
index 00000000000..37a6039d1b7
--- /dev/null
+++ b/src/include/os_fstream.i
@@ -0,0 +1,97 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+/*
+ * __wt_getline --
+ * Get a line from a stream.
+ */
+static inline int
+__wt_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fs, WT_ITEM *buf)
+{
+ return (fs->getline(session, fs, buf));
+}
+
+/*
+ * __wt_fclose --
+ * Close a stream.
+ */
+static inline int
+__wt_fclose(WT_SESSION_IMPL *session, WT_FSTREAM **fsp)
+{
+ WT_FSTREAM *fs;
+
+ if ((fs = *fsp) == NULL)
+ return (0);
+ *fsp = NULL;
+ return (fs->close(session, fs));
+}
+
+/*
+ * __wt_fflush --
+ * Flush a stream.
+ */
+static inline int
+__wt_fflush(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ return (fs->flush(session, fs));
+}
+
+/*
+ * __wt_vfprintf --
+ * ANSI C vfprintf.
+ */
+static inline int
+__wt_vfprintf(
+ WT_SESSION_IMPL *session, WT_FSTREAM *fs, const char *fmt, va_list ap)
+{
+ WT_RET(__wt_verbose(
+ session, WT_VERB_HANDLEOPS, "%s: handle-printf", fs->name));
+
+ return (fs->printf(session, fs, fmt, ap));
+}
+
+/*
+ * __wt_fprintf --
+ * ANSI C fprintf.
+ */
+static inline int
+__wt_fprintf(WT_SESSION_IMPL *session, WT_FSTREAM *fs, const char *fmt, ...)
+ WT_GCC_FUNC_ATTRIBUTE((format (printf, 3, 4)))
+{
+ WT_DECL_RET;
+ va_list ap;
+
+ va_start(ap, fmt);
+ ret = __wt_vfprintf(session, fs, fmt, ap);
+ va_end(ap);
+
+ return (ret);
+}
+
+/*
+ * __wt_sync_and_rename --
+ * Flush and close a stream, then swap it into place.
+ */
+static inline int
+__wt_sync_and_rename(WT_SESSION_IMPL *session,
+ WT_FSTREAM **fsp, const char *from, const char *to)
+{
+ WT_DECL_RET;
+ WT_FSTREAM *fs;
+
+ fs = *fsp;
+ *fsp = NULL;
+
+ /* Flush to disk and close the handle. */
+ WT_TRET(__wt_fflush(session, fs));
+ WT_TRET(__wt_fsync(session, fs->fh, true));
+ WT_TRET(__wt_fclose(session, &fs));
+ WT_RET(ret);
+
+ return (__wt_rename_and_sync_directory(session, from, to));
+}
diff --git a/src/include/serial.i b/src/include/serial.i
index fa920de7e37..c0cd9c85ee9 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -306,7 +306,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
if ((txn = page->modify->obsolete_check_txn) != WT_TXN_NONE) {
if (!__wt_txn_visible_all(session, txn)) {
/* Try to move the oldest ID forward and re-check. */
- __wt_txn_update_oldest(session, false);
+ WT_RET(__wt_txn_update_oldest(session, false));
if (!__wt_txn_visible_all(session, txn))
return (0);
diff --git a/src/include/stat.h b/src/include/stat.h
index f9170dc1a79..18461b1ee38 100644
--- a/src/include/stat.h
+++ b/src/include/stat.h
@@ -261,11 +261,16 @@ struct __wt_connection_stats {
int64_t cache_bytes_read;
int64_t cache_bytes_write;
int64_t cache_eviction_checkpoint;
+ int64_t cache_eviction_get_ref;
+ int64_t cache_eviction_get_ref_empty;
+ int64_t cache_eviction_get_ref_empty2;
int64_t cache_eviction_aggressive_set;
int64_t cache_eviction_queue_empty;
int64_t cache_eviction_queue_not_empty;
int64_t cache_eviction_server_evicting;
int64_t cache_eviction_server_not_evicting;
+ int64_t cache_eviction_server_toobig;
+ int64_t cache_eviction_server_slept;
int64_t cache_eviction_slow;
int64_t cache_eviction_worker_evicting;
int64_t cache_eviction_force_fail;
@@ -288,6 +293,7 @@ struct __wt_connection_stats {
int64_t cache_eviction_app;
int64_t cache_read;
int64_t cache_read_lookaside;
+ int64_t cache_pages_requested;
int64_t cache_eviction_fail;
int64_t cache_eviction_walk;
int64_t cache_write;
@@ -458,6 +464,7 @@ struct __wt_dsrc_stats {
int64_t cache_write_lookaside;
int64_t cache_read;
int64_t cache_read_lookaside;
+ int64_t cache_pages_requested;
int64_t cache_write;
int64_t cache_write_restore;
int64_t cache_eviction_clean;
diff --git a/src/include/txn.h b/src/include/txn.h
index 1e82e2d982a..d10738cc670 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -74,7 +74,7 @@ struct __wt_txn_global {
volatile uint64_t current; /* Current transaction ID. */
/* The oldest running transaction ID (may race). */
- uint64_t last_running;
+ volatile uint64_t last_running;
/*
* The oldest transaction ID that is not yet visible to some
@@ -82,8 +82,11 @@ struct __wt_txn_global {
*/
volatile uint64_t oldest_id;
- /* Count of scanning threads, or -1 for exclusive access. */
- volatile int32_t scan_count;
+ /*
+ * Prevents the oldest ID moving forwards while threads are scanning
+ * the global transaction state.
+ */
+ WT_RWLOCK *scan_rwlock;
/*
* Track information about the running checkpoint. The transaction
diff --git a/src/include/txn.i b/src/include/txn.i
index f5ca44c2ada..96f7426e421 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -261,14 +261,14 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
* eviction, it's better to do it beforehand.
*/
WT_RET(__wt_cache_eviction_check(session, false, NULL));
-
- __wt_txn_get_snapshot(session);
+ WT_RET(__wt_txn_get_snapshot(session));
}
F_SET(txn, WT_TXN_RUNNING);
if (F_ISSET(S2C(session), WT_CONN_READONLY))
F_SET(txn, WT_TXN_READONLY);
- return (false);
+
+ return (0);
}
/*
@@ -450,7 +450,7 @@ __wt_txn_read_last(WT_SESSION_IMPL *session)
* __wt_txn_cursor_op --
* Called for each cursor operation.
*/
-static inline void
+static inline int
__wt_txn_cursor_op(WT_SESSION_IMPL *session)
{
WT_TXN *txn;
@@ -482,7 +482,9 @@ __wt_txn_cursor_op(WT_SESSION_IMPL *session)
if (txn_state->snap_min == WT_TXN_NONE)
txn_state->snap_min = txn_global->last_running;
} else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))
- __wt_txn_get_snapshot(session);
+ WT_RET(__wt_txn_get_snapshot(session));
+
+ return (0);
}
/*
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 279858a808e..007df44f257 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -71,6 +71,8 @@ struct __wt_encryptor; typedef struct __wt_encryptor WT_ENCRYPTOR;
struct __wt_event_handler; typedef struct __wt_event_handler WT_EVENT_HANDLER;
struct __wt_extension_api; typedef struct __wt_extension_api WT_EXTENSION_API;
struct __wt_extractor; typedef struct __wt_extractor WT_EXTRACTOR;
+struct __wt_file_handle; typedef struct __wt_file_handle WT_FILE_HANDLE;
+struct __wt_file_system; typedef struct __wt_file_system WT_FILE_SYSTEM;
struct __wt_item; typedef struct __wt_item WT_ITEM;
struct __wt_session; typedef struct __wt_session WT_SESSION;
@@ -421,6 +423,9 @@ struct __wt_cursor {
* @errors
* In particular, if \c overwrite is not configured and a record with
* the specified key already exists, ::WT_DUPLICATE_KEY is returned.
+ * Also, if \c in_memory is configured for the database and the insert
+ * requires more than the configured cache size to complete,
+ * ::WT_CACHE_FULL is returned.
*/
int __F(insert)(WT_CURSOR *cursor);
@@ -451,6 +456,9 @@ struct __wt_cursor {
* @errors
* In particular, if \c overwrite is not configured and no record with
* the specified key exists, ::WT_NOTFOUND is returned.
+ * Also, if \c in_memory is configured for the database and the update
+ * requires more than the configured cache size to complete,
+ * ::WT_CACHE_FULL is returned.
*/
int __F(update)(WT_CURSOR *cursor);
@@ -1241,18 +1249,21 @@ struct __wt_session {
* @param join_cursor a cursor that was opened using a
* \c "join:" URI. It may not have been used for any operations
* other than other join calls.
- * @param ref_cursor either an index cursor having the same base table
- * as the join_cursor, or a table cursor open on the same base table.
- * The ref_cursor must be positioned.
+ * @param ref_cursor an index cursor having the same base table
+ * as the join_cursor, or a table cursor open on the same base table,
+ * or another join cursor. Unless the ref_cursor is another join
+ * cursor, it must be positioned.
*
* The ref_cursor limits the results seen by iterating the
* join_cursor to table items referred to by the key in this
* index. The set of keys referred to is modified by the compare
* config option.
*
- * Multiple join calls builds up a set of ref_cursors, and the
- * results seen by iteration are the intersection of the cursor
- * ranges participating in the join.
+ * Multiple join calls build up a set of ref_cursors, and
+ * by default, the results seen by iteration are the intersection
+ * of the cursor ranges participating in the join. When configured
+ * with \c "operation=or", the results seen are the union of
+ * the participating cursor ranges.
*
* After the join call completes, the ref_cursor cursor may not be
* used for any purpose other than get_key and get_value. Any other
@@ -1275,6 +1286,13 @@ struct __wt_session {
* also influences evaluation order for cursors in the join. When the
* count is equal for multiple bloom filters in a composition of joins\,
* the bloom filter may be shared., an integer; default \c .}
+ * @config{operation, the operation applied between this and other
+ * joined cursors. When "operation=and" is specified\, all the
+ * conditions implied by joins must be satisfied for an entry to be
+ * returned by the join cursor; when "operation=or" is specified\, only
+ * one must be satisfied. All cursors joined to a join cursor must have
+ * matching operations., a string\, chosen from the following options:
+ * \c "and"\, \c "or"; default \c "and".}
* @config{strategy, when set to bloom\, a bloom filter is created and
* populated for this index. This has an up front cost but may reduce
* the number of accesses to the main table when iterating the joined
@@ -2018,6 +2036,10 @@ struct __wt_connection {
* @configstart{WT_CONNECTION.load_extension, see dist/api_data.py}
* @config{config, configuration string passed to the entry point of the
* extension as its WT_CONFIG_ARG argument., a string; default empty.}
+ * @config{early_load, whether this extension should be loaded at the
+ * beginning of ::wiredtiger_open. Only applicable to extensions loaded
+ * via the wiredtiger_open configuration string., a boolean flag;
+ * default \c false.}
* @config{entry, the entry point of the extension\, called to
* initialize the extension when it is loaded. The signature of the
* function must match ::wiredtiger_extension_init., a string; default
@@ -2129,6 +2151,23 @@ struct __wt_connection {
WT_EXTRACTOR *extractor, const char *config);
/*!
+ * Configure a custom file system.
+ *
+ * This method can only be called from an early loaded extension
+ * module. The application must first implement the WT_FILE_SYSTEM
+ * interface and then register the implementation with WiredTiger:
+ *
+ * @snippet ex_file_system.c WT_FILE_SYSTEM register
+ *
+ * @param connection the connection handle
+ * @param fs the populated file system structure
+ * @configempty{WT_CONNECTION.set_file_system, see dist/api_data.py}
+ * @errors
+ */
+ int __F(set_file_system)(
+ WT_CONNECTION *connection, WT_FILE_SYSTEM *fs, const char *config);
+
+ /*!
* Return a reference to the WiredTiger extension functions.
*
* @snippet ex_data_source.c WT_EXTENSION_API declaration
@@ -2289,6 +2328,8 @@ struct __wt_connection {
* @config{ ),,}
* @config{hazard_max, maximum number of simultaneous hazard pointers per
* session handle., an integer greater than or equal to 15; default \c 1000.}
+ * @config{in_memory, keep data in-memory only. See @ref in_memory for more
+ * information., a boolean flag; default \c false.}
* @config{log = (, enable logging. Enabling logging uses three sessions from
* the configured session_max., a set of related configuration options defined
* below.}
@@ -3003,15 +3044,15 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
* if recovery is required to use the database.
*/
#define WT_RUN_RECOVERY -31806
-/*! @cond internal */
/*!
* Operation would overflow cache.
- * This error is generated when wiredtiger_open is configured to run in-memory,
- * and an insert or update operation requires more than the configured cache
- * size to complete.
+ * This error is only generated when wiredtiger_open is configured to run in-
+ * memory, and an insert or update operation requires more than the configured
+ * cache size to complete. The operation may be retried; if a transaction is in
+ * progress, it should be rolled back and the operation retried in a new
+ * transaction.
*/
#define WT_CACHE_FULL -31807
-/*! @endcond */
/*! @cond internal */
/*! Permission denied (internal). */
#define WT_PERM_DENIED -31808
@@ -3038,7 +3079,7 @@ const char *wiredtiger_version(int *majorp, int *minorp, int *patchp);
/*******************************************
* Forward structure declarations for the extension API
*******************************************/
-struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG;
+struct __wt_config_arg; typedef struct __wt_config_arg WT_CONFIG_ARG;
/*!
* The interface implemented by applications to provide custom ordering of
@@ -3569,7 +3610,7 @@ struct __wt_encryptor {
* number of bytes needed.
*
* @param[out] expansion_constantp the additional number of bytes needed
- * when encrypting.
+ * when encrypting.
* @returns zero for success, non-zero to indicate an error.
*
* @snippet nop_encrypt.c WT_ENCRYPTOR sizing
@@ -3588,8 +3629,7 @@ struct __wt_encryptor {
* is used instead of this one for any callbacks.
*
* @param[in] encrypt_config the "encryption" portion of the
- * configuration from the wiredtiger_open or
- * WT_SESSION::create call
+ * configuration from the wiredtiger_open or WT_SESSION::create call
* @param[out] customp the new modified encryptor, or NULL.
* @returns zero for success, non-zero to indicate an error.
*/
@@ -3664,6 +3704,466 @@ struct __wt_extractor {
int (*terminate)(WT_EXTRACTOR *extractor, WT_SESSION *session);
};
+#if !defined(SWIG)
+/*! WT_FILE_SYSTEM::open_file file types */
+typedef enum {
+ WT_OPEN_FILE_TYPE_CHECKPOINT, /*!< open a data file checkpoint */
+ WT_OPEN_FILE_TYPE_DATA, /*!< open a data file */
+ WT_OPEN_FILE_TYPE_DIRECTORY, /*!< open a directory */
+ WT_OPEN_FILE_TYPE_LOG, /*!< open a log file */
+ WT_OPEN_FILE_TYPE_REGULAR /*!< open a regular file */
+} WT_OPEN_FILE_TYPE;
+
+/*! WT_FILE_SYSTEM::open_file flags: create if does not exist */
+#define WT_OPEN_CREATE 0x001
+/*! WT_FILE_SYSTEM::open_file flags: direct I/O requested */
+#define WT_OPEN_DIRECTIO 0x002
+/*! WT_FILE_SYSTEM::open_file flags: error if exclusive use not available */
+#define WT_OPEN_EXCLUSIVE 0x004
+#ifndef DOXYGEN
+#define WT_OPEN_FIXED 0x008 /* Path not home relative (internal) */
+#endif
+/*! WT_FILE_SYSTEM::open_file flags: open is read-only */
+#define WT_OPEN_READONLY 0x010
+
+/*!
+ * The interface implemented by applications to provide a custom file system
+ * implementation.
+ *
+ * <b>Thread safety:</b> WiredTiger may invoke methods on the WT_FILE_SYSTEM
+ * interface from multiple threads concurrently. It is the responsibility of
+ * the implementation to protect any shared data.
+ *
+ * Applications register implementations with WiredTiger by calling
+ * WT_CONNECTION::set_file_system. See @ref custom_file_systems for more
+ * information.
+ *
+ * @snippet ex_file_system.c WT_FILE_SYSTEM register
+ */
+struct __wt_file_system {
+ /*!
+ * Return a list of file names for the named directory.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param directory the name of the directory
+ * @param prefix if not NULL, only files with names matching the prefix
+ * are returned
+ * @param[out] dirlist the method returns an allocated array of
+ * individually allocated strings, one for each entry in the
+ * directory.
+ * @param[out] countp the number of entries returned
+ */
+ int (*directory_list)(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
+ const char *directory, const char *prefix, char ***dirlist,
+ uint32_t *countp);
+
+ /*!
+ * Free memory allocated by WT_FILE_SYSTEM::directory_list.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param dirlist array returned by WT_FILE_SYSTEM::directory_list
+ * @param count count returned by WT_FILE_SYSTEM::directory_list
+ */
+ int (*directory_list_free)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, char **dirlist, uint32_t count);
+
+ /*!
+ * Flush the named directory.
+ *
+ * This method is not required for readonly file systems or file systems
+ * where it is not necessary to flush a file's directory to ensure the
+ * durability of file system operations, and should be set to NULL when
+ * not required by the file system.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param directory the name of the directory
+ */
+ int (*directory_sync)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *directory);
+
+ /*!
+ * Return if the named file system object exists.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param name the name of the file
+ * @param[out] existp If the named file system object exists
+ */
+ int (*exist)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *name, bool *existp);
+
+ /*!
+ * Open a handle for a named file system object
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param name the name of the file system object
+ * @param file_type the type of the file
+ * The file type is provided to allow optimization for different file
+ * access patterns.
+ * @param flags flags indicating how to open the file, one or more of
+ * ::WT_OPEN_CREATE, ::WT_OPEN_DIRECTIO, ::WT_OPEN_EXCLUSIVE or
+ * ::WT_OPEN_READONLY.
+ * @param[out] file_handlep the handle to the newly opened file. File
+ * system implementations must allocate memory for the handle and
+ * the WT_FILE_HANDLE::name field, and fill in the WT_FILE_HANDLE::
+ * fields. Applications wanting to associate private information
+ * with the WT_FILE_HANDLE:: structure should declare and allocate
+ * their own structure as a superset of a WT_FILE_HANDLE:: structure.
+ */
+ int (*open_file)(WT_FILE_SYSTEM *file_system, WT_SESSION *session,
+ const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ WT_FILE_HANDLE **file_handlep);
+
+ /*!
+ * Remove a named file system object
+ *
+ * This method is not required for readonly file systems and should be
+ * set to NULL when not required by the file system.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param name the name of the file system object
+ */
+ int (*remove)(
+ WT_FILE_SYSTEM *file_system, WT_SESSION *session, const char *name);
+
+ /*!
+ * Rename a named file system object
+ *
+ * This method is not required for readonly file systems and should be
+ * set to NULL when not required by the file system.
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param from the original name of the object
+ * @param to the new name for the object
+ */
+ int (*rename)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *from, const char *to);
+
+ /*!
+ * Return the size of a named file system object
+ *
+ * @errors
+ *
+ * @param file_system the WT_FILE_SYSTEM
+ * @param session the current WiredTiger session
+ * @param name the name of the file system object
+ * @param[out] sizep the size of the file system entry
+ */
+ int (*size)(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *session, const char *name, wt_off_t *sizep);
+
+ /*!
+ * A callback performed when the file system is closed and will no
+ * longer be accessed by the WiredTiger database.
+ *
+ * This method is not required and should be set to NULL when not
+ * required by the file system.
+ *
+ * The WT_FILE_SYSTEM::terminate callback is intended to allow cleanup,
+ * the handle will not be subsequently accessed by WiredTiger.
+ */
+ int (*terminate)(WT_FILE_SYSTEM *file_system, WT_SESSION *session);
+};
+
+/*! WT_FILE_HANDLE::fadvise flags: no longer need */
+#define WT_FILE_HANDLE_DONTNEED 1
+/*! WT_FILE_HANDLE::fadvise flags: will need */
+#define WT_FILE_HANDLE_WILLNEED 2
+
+/*!
+ * A file handle implementation returned by WT_FILE_SYSTEM::open_file.
+ *
+ * <b>Thread safety:</b> Unless explicitly stated otherwise, WiredTiger may
+ * invoke methods on the WT_FILE_HANDLE interface from multiple threads
+ * concurrently. It is the responsibility of the implementation to protect
+ * any shared data.
+ *
+ * See @ref custom_file_systems for more information.
+ */
+struct __wt_file_handle {
+ /*!
+ * The enclosing file system, set by WT_FILE_SYSTEM::open_file.
+ */
+ WT_FILE_SYSTEM *file_system;
+
+ /*!
+ * The name of the file, set by WT_FILE_SYSTEM::open_file.
+ */
+ char *name;
+
+ /*!
+ * Close a file handle, the handle will not be further accessed by
+ * WiredTiger.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ */
+ int (*close)(WT_FILE_HANDLE *file_handle, WT_SESSION *session);
+
+ /*!
+ * Indicate expected future use of file ranges, based on the POSIX
+ * 1003.1 standard fadvise.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param offset the file offset
+ * @param len the size of the advisory
+ * @param advice one of ::WT_FILE_HANDLE_WILLNEED or
+ * ::WT_FILE_HANDLE_DONTNEED.
+ */
+ int (*fadvise)(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *session, wt_off_t offset, wt_off_t len, int advice);
+
+ /*!
+ * Ensure disk space is allocated for the file, based on the POSIX
+ * 1003.1 standard fallocate.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * This method is not called by multiple threads concurrently (on the
+ * same file handle). If the file handle's fallocate method supports
+ * concurrent calls, set the WT_FILE_HANDLE::fallocate_nolock method
+ * instead.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param offset the file offset
+ * @param len the length of the file range to allocate
+ */
+ int (*fallocate)(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *session, wt_off_t, wt_off_t);
+
+ /*!
+ * Ensure disk space is allocated for the file, based on the POSIX
+ * 1003.1 standard fallocate.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * This method may be called by multiple threads concurrently (on the
+ * same file handle). If the file handle's fallocate method does not
+ * support concurrent calls, set the WT_FILE_HANDLE::fallocate method
+ * instead.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param offset the file offset
+ * @param len the length of the file range to allocate
+ */
+ int (*fallocate_nolock)(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *session, wt_off_t, wt_off_t);
+
+ /*!
+ * Lock/unlock a file from the perspective of other processes running
+ * in the system.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param lock whether to lock or unlock
+ */
+ int (*lock)(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *session, bool lock);
+
+ /*!
+ * Map a file into memory, based on the POSIX 1003.1 standard mmap.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param[out] mapped_regionp a reference to a memory location into
+ * which should be stored a pointer to the start of the mapped region
+ * @param[out] lengthp a reference to a memory location into which
+ * should be stored the length of the region
+ * @param[out] mapped_cookiep a reference to a memory location into
+ * which can be optionally stored a pointer to an opaque cookie
+ * which is subsequently passed to WT_FILE_HANDLE::unmap.
+ */
+ int (*map)(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+ void *mapped_regionp, size_t *lengthp, void *mapped_cookiep);
+
+ /*!
+ * Unmap part of a memory mapped file, based on the POSIX 1003.1
+ * standard madvise.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param map a location in the mapped region unlikely to be used in the
+ * near future
+ * @param length the length of the mapped region to discard
+ * @param mapped_cookie any cookie set by the WT_FILE_HANDLE::map method
+ */
+ int (*map_discard)(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *session, void *map, size_t length, void *mapped_cookie);
+
+ /*!
+ * Preload part of a memory mapped file, based on the POSIX 1003.1
+ * standard madvise.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param map a location in the mapped region likely to be used in the
+ * near future
+ * @param length the size of the mapped region to preload
+ * @param mapped_cookie any cookie set by the WT_FILE_HANDLE::map method
+ */
+ int (*map_preload)(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+ const void *map, size_t length, void *mapped_cookie);
+
+ /*!
+ * Unmap a memory mapped file, based on the POSIX 1003.1 standard
+ * munmap.
+ *
+ * This method is only required if a valid implementation of map is
+ * provided by the file, and should be set to NULL otherwise.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param mapped_region a pointer to the start of the mapped region
+ * @param length the length of the mapped region
+ * @param mapped_cookie any cookie set by the WT_FILE_HANDLE::map method
+ */
+ int (*unmap)(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+ void *mapped_region, size_t length, void *mapped_cookie);
+
+ /*!
+ * Read from a file, based on the POSIX 1003.1 standard pread.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param offset the offset in the file to start reading from
+ * @param len the amount to read
+ * @param[out] buf buffer to hold the content read from file
+ */
+ int (*read)(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *session, wt_off_t offset, size_t len, void *buf);
+
+ /*!
+ * Return the size of a file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param[out] sizep the size of the file
+ */
+ int (*size)(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t *sizep);
+
+ /*!
+ * Make outstanding file writes durable and do not return until writes
+ * are complete.
+ *
+ * This method is not required for read-only files, and should be set
+ * to NULL when not supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ */
+ int (*sync)(WT_FILE_HANDLE *file_handle, WT_SESSION *session);
+
+ /*!
+ * Schedule the outstanding file writes required for durability and
+ * return immediately.
+ *
+ * This method is not required, and should be set to NULL when not
+ * supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ */
+ int (*sync_nowait)(WT_FILE_HANDLE *file_handle, WT_SESSION *session);
+
+ /*!
+ * Lengthen or shorten a file to the specified length, based on the
+ * POSIX 1003.1 standard ftruncate.
+ *
+ * This method is not required for read-only files, and should be set
+ * to NULL when not supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param length desired file size after truncate
+ */
+ int (*truncate)(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *session, wt_off_t length);
+
+ /*!
+ * Write to a file, based on the POSIX 1003.1 standard pwrite.
+ *
+ * This method is not required for read-only files, and should be set
+ * to NULL when not supported by the file.
+ *
+ * @errors
+ *
+ * @param file_handle the WT_FILE_HANDLE
+ * @param session the current WiredTiger session
+ * @param offset offset at which to start writing
+ * @param length amount of data to write
+ * @param buf content to be written to the file
+ */
+ int (*write)(WT_FILE_HANDLE *file_handle, WT_SESSION *session,
+ wt_off_t offset, size_t length, const void *buf);
+};
+#endif /* !defined(SWIG) */
+
/*!
* Entry point to an extension, called when the extension is loaded.
*
@@ -3774,273 +4274,286 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_CACHE_BYTES_WRITE 1032
/*! cache: checkpoint blocked page eviction */
#define WT_STAT_CONN_CACHE_EVICTION_CHECKPOINT 1033
+/*! cache: eviction calls to get a page */
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF 1034
+/*! cache: eviction calls to get a page found queue empty */
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY 1035
+/*! cache: eviction calls to get a page found queue empty after locking */
+#define WT_STAT_CONN_CACHE_EVICTION_GET_REF_EMPTY2 1036
/*! cache: eviction currently operating in aggressive mode */
-#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1034
+#define WT_STAT_CONN_CACHE_EVICTION_AGGRESSIVE_SET 1037
/*! cache: eviction server candidate queue empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1035
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_EMPTY 1038
/*! cache: eviction server candidate queue not empty when topping up */
-#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1036
+#define WT_STAT_CONN_CACHE_EVICTION_QUEUE_NOT_EMPTY 1039
/*! cache: eviction server evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1037
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_EVICTING 1040
/*! cache: eviction server populating queue, but not evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1038
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_NOT_EVICTING 1041
+/*! cache: eviction server skipped very large page */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_TOOBIG 1042
+/*! cache: eviction server slept, because we did not make progress with
+ * eviction */
+#define WT_STAT_CONN_CACHE_EVICTION_SERVER_SLEPT 1043
/*! cache: eviction server unable to reach eviction goal */
-#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1039
+#define WT_STAT_CONN_CACHE_EVICTION_SLOW 1044
/*! cache: eviction worker thread evicting pages */
-#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1040
+#define WT_STAT_CONN_CACHE_EVICTION_WORKER_EVICTING 1045
/*! cache: failed eviction of pages that exceeded the in-memory maximum */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1041
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_FAIL 1046
/*! cache: hazard pointer blocked page eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1042
+#define WT_STAT_CONN_CACHE_EVICTION_HAZARD 1047
/*! cache: in-memory page passed criteria to be split */
-#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1043
+#define WT_STAT_CONN_CACHE_INMEM_SPLITTABLE 1048
/*! cache: in-memory page splits */
-#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1044
+#define WT_STAT_CONN_CACHE_INMEM_SPLIT 1049
/*! cache: internal pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1045
+#define WT_STAT_CONN_CACHE_EVICTION_INTERNAL 1050
/*! cache: internal pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1046
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_INTERNAL 1051
/*! cache: leaf pages split during eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1047
+#define WT_STAT_CONN_CACHE_EVICTION_SPLIT_LEAF 1052
/*! cache: lookaside table insert calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1048
+#define WT_STAT_CONN_CACHE_LOOKASIDE_INSERT 1053
/*! cache: lookaside table remove calls */
-#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1049
+#define WT_STAT_CONN_CACHE_LOOKASIDE_REMOVE 1054
/*! cache: maximum bytes configured */
-#define WT_STAT_CONN_CACHE_BYTES_MAX 1050
+#define WT_STAT_CONN_CACHE_BYTES_MAX 1055
/*! cache: maximum page size at eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1051
+#define WT_STAT_CONN_CACHE_EVICTION_MAXIMUM_PAGE_SIZE 1056
/*! cache: modified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1052
+#define WT_STAT_CONN_CACHE_EVICTION_DIRTY 1057
/*! cache: page split during eviction deepened the tree */
-#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1053
+#define WT_STAT_CONN_CACHE_EVICTION_DEEPEN 1058
/*! cache: page written requiring lookaside records */
-#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1054
+#define WT_STAT_CONN_CACHE_WRITE_LOOKASIDE 1059
/*! cache: pages currently held in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_INUSE 1055
+#define WT_STAT_CONN_CACHE_PAGES_INUSE 1060
/*! cache: pages evicted because they exceeded the in-memory maximum */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1056
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE 1061
/*! cache: pages evicted because they had chains of deleted items */
-#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1057
+#define WT_STAT_CONN_CACHE_EVICTION_FORCE_DELETE 1062
/*! cache: pages evicted by application threads */
-#define WT_STAT_CONN_CACHE_EVICTION_APP 1058
+#define WT_STAT_CONN_CACHE_EVICTION_APP 1063
/*! cache: pages read into cache */
-#define WT_STAT_CONN_CACHE_READ 1059
+#define WT_STAT_CONN_CACHE_READ 1064
/*! cache: pages read into cache requiring lookaside entries */
-#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1060
+#define WT_STAT_CONN_CACHE_READ_LOOKASIDE 1065
+/*! cache: pages requested from the cache */
+#define WT_STAT_CONN_CACHE_PAGES_REQUESTED 1066
/*! cache: pages selected for eviction unable to be evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1061
+#define WT_STAT_CONN_CACHE_EVICTION_FAIL 1067
/*! cache: pages walked for eviction */
-#define WT_STAT_CONN_CACHE_EVICTION_WALK 1062
+#define WT_STAT_CONN_CACHE_EVICTION_WALK 1068
/*! cache: pages written from cache */
-#define WT_STAT_CONN_CACHE_WRITE 1063
+#define WT_STAT_CONN_CACHE_WRITE 1069
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1064
+#define WT_STAT_CONN_CACHE_WRITE_RESTORE 1070
/*! cache: percentage overhead */
-#define WT_STAT_CONN_CACHE_OVERHEAD 1065
+#define WT_STAT_CONN_CACHE_OVERHEAD 1071
/*! cache: tracked bytes belonging to internal pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1066
+#define WT_STAT_CONN_CACHE_BYTES_INTERNAL 1072
/*! cache: tracked bytes belonging to leaf pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_LEAF 1067
+#define WT_STAT_CONN_CACHE_BYTES_LEAF 1073
/*! cache: tracked bytes belonging to overflow pages in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1068
+#define WT_STAT_CONN_CACHE_BYTES_OVERFLOW 1074
/*! cache: tracked dirty bytes in the cache */
-#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1069
+#define WT_STAT_CONN_CACHE_BYTES_DIRTY 1075
/*! cache: tracked dirty pages in the cache */
-#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1070
+#define WT_STAT_CONN_CACHE_PAGES_DIRTY 1076
/*! cache: unmodified pages evicted */
-#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1071
+#define WT_STAT_CONN_CACHE_EVICTION_CLEAN 1077
/*! connection: auto adjusting condition resets */
-#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1072
+#define WT_STAT_CONN_COND_AUTO_WAIT_RESET 1078
/*! connection: auto adjusting condition wait calls */
-#define WT_STAT_CONN_COND_AUTO_WAIT 1073
+#define WT_STAT_CONN_COND_AUTO_WAIT 1079
/*! connection: files currently open */
-#define WT_STAT_CONN_FILE_OPEN 1074
+#define WT_STAT_CONN_FILE_OPEN 1080
/*! connection: memory allocations */
-#define WT_STAT_CONN_MEMORY_ALLOCATION 1075
+#define WT_STAT_CONN_MEMORY_ALLOCATION 1081
/*! connection: memory frees */
-#define WT_STAT_CONN_MEMORY_FREE 1076
+#define WT_STAT_CONN_MEMORY_FREE 1082
/*! connection: memory re-allocations */
-#define WT_STAT_CONN_MEMORY_GROW 1077
+#define WT_STAT_CONN_MEMORY_GROW 1083
/*! connection: pthread mutex condition wait calls */
-#define WT_STAT_CONN_COND_WAIT 1078
+#define WT_STAT_CONN_COND_WAIT 1084
/*! connection: pthread mutex shared lock read-lock calls */
-#define WT_STAT_CONN_RWLOCK_READ 1079
+#define WT_STAT_CONN_RWLOCK_READ 1085
/*! connection: pthread mutex shared lock write-lock calls */
-#define WT_STAT_CONN_RWLOCK_WRITE 1080
+#define WT_STAT_CONN_RWLOCK_WRITE 1086
/*! connection: total read I/Os */
-#define WT_STAT_CONN_READ_IO 1081
+#define WT_STAT_CONN_READ_IO 1087
/*! connection: total write I/Os */
-#define WT_STAT_CONN_WRITE_IO 1082
+#define WT_STAT_CONN_WRITE_IO 1088
/*! cursor: cursor create calls */
-#define WT_STAT_CONN_CURSOR_CREATE 1083
+#define WT_STAT_CONN_CURSOR_CREATE 1089
/*! cursor: cursor insert calls */
-#define WT_STAT_CONN_CURSOR_INSERT 1084
+#define WT_STAT_CONN_CURSOR_INSERT 1090
/*! cursor: cursor next calls */
-#define WT_STAT_CONN_CURSOR_NEXT 1085
+#define WT_STAT_CONN_CURSOR_NEXT 1091
/*! cursor: cursor prev calls */
-#define WT_STAT_CONN_CURSOR_PREV 1086
+#define WT_STAT_CONN_CURSOR_PREV 1092
/*! cursor: cursor remove calls */
-#define WT_STAT_CONN_CURSOR_REMOVE 1087
+#define WT_STAT_CONN_CURSOR_REMOVE 1093
/*! cursor: cursor reset calls */
-#define WT_STAT_CONN_CURSOR_RESET 1088
+#define WT_STAT_CONN_CURSOR_RESET 1094
/*! cursor: cursor restarted searches */
-#define WT_STAT_CONN_CURSOR_RESTART 1089
+#define WT_STAT_CONN_CURSOR_RESTART 1095
/*! cursor: cursor search calls */
-#define WT_STAT_CONN_CURSOR_SEARCH 1090
+#define WT_STAT_CONN_CURSOR_SEARCH 1096
/*! cursor: cursor search near calls */
-#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1091
+#define WT_STAT_CONN_CURSOR_SEARCH_NEAR 1097
/*! cursor: cursor update calls */
-#define WT_STAT_CONN_CURSOR_UPDATE 1092
+#define WT_STAT_CONN_CURSOR_UPDATE 1098
/*! cursor: truncate calls */
-#define WT_STAT_CONN_CURSOR_TRUNCATE 1093
+#define WT_STAT_CONN_CURSOR_TRUNCATE 1099
/*! data-handle: connection data handles currently active */
-#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1094
+#define WT_STAT_CONN_DH_CONN_HANDLE_COUNT 1100
/*! data-handle: connection sweep candidate became referenced */
-#define WT_STAT_CONN_DH_SWEEP_REF 1095
+#define WT_STAT_CONN_DH_SWEEP_REF 1101
/*! data-handle: connection sweep dhandles closed */
-#define WT_STAT_CONN_DH_SWEEP_CLOSE 1096
+#define WT_STAT_CONN_DH_SWEEP_CLOSE 1102
/*! data-handle: connection sweep dhandles removed from hash list */
-#define WT_STAT_CONN_DH_SWEEP_REMOVE 1097
+#define WT_STAT_CONN_DH_SWEEP_REMOVE 1103
/*! data-handle: connection sweep time-of-death sets */
-#define WT_STAT_CONN_DH_SWEEP_TOD 1098
+#define WT_STAT_CONN_DH_SWEEP_TOD 1104
/*! data-handle: connection sweeps */
-#define WT_STAT_CONN_DH_SWEEPS 1099
+#define WT_STAT_CONN_DH_SWEEPS 1105
/*! data-handle: session dhandles swept */
-#define WT_STAT_CONN_DH_SESSION_HANDLES 1100
+#define WT_STAT_CONN_DH_SESSION_HANDLES 1106
/*! data-handle: session sweep attempts */
-#define WT_STAT_CONN_DH_SESSION_SWEEPS 1101
+#define WT_STAT_CONN_DH_SESSION_SWEEPS 1107
/*! log: busy returns attempting to switch slots */
-#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1102
+#define WT_STAT_CONN_LOG_SLOT_SWITCH_BUSY 1108
/*! log: consolidated slot closures */
-#define WT_STAT_CONN_LOG_SLOT_CLOSES 1103
+#define WT_STAT_CONN_LOG_SLOT_CLOSES 1109
/*! log: consolidated slot join races */
-#define WT_STAT_CONN_LOG_SLOT_RACES 1104
+#define WT_STAT_CONN_LOG_SLOT_RACES 1110
/*! log: consolidated slot join transitions */
-#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1105
+#define WT_STAT_CONN_LOG_SLOT_TRANSITIONS 1111
/*! log: consolidated slot joins */
-#define WT_STAT_CONN_LOG_SLOT_JOINS 1106
+#define WT_STAT_CONN_LOG_SLOT_JOINS 1112
/*! log: consolidated slot unbuffered writes */
-#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1107
+#define WT_STAT_CONN_LOG_SLOT_UNBUFFERED 1113
/*! log: log bytes of payload data */
-#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1108
+#define WT_STAT_CONN_LOG_BYTES_PAYLOAD 1114
/*! log: log bytes written */
-#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1109
+#define WT_STAT_CONN_LOG_BYTES_WRITTEN 1115
/*! log: log files manually zero-filled */
-#define WT_STAT_CONN_LOG_ZERO_FILLS 1110
+#define WT_STAT_CONN_LOG_ZERO_FILLS 1116
/*! log: log flush operations */
-#define WT_STAT_CONN_LOG_FLUSH 1111
+#define WT_STAT_CONN_LOG_FLUSH 1117
/*! log: log force write operations */
-#define WT_STAT_CONN_LOG_FORCE_WRITE 1112
+#define WT_STAT_CONN_LOG_FORCE_WRITE 1118
/*! log: log force write operations skipped */
-#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1113
+#define WT_STAT_CONN_LOG_FORCE_WRITE_SKIP 1119
/*! log: log records compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1114
+#define WT_STAT_CONN_LOG_COMPRESS_WRITES 1120
/*! log: log records not compressed */
-#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1115
+#define WT_STAT_CONN_LOG_COMPRESS_WRITE_FAILS 1121
/*! log: log records too small to compress */
-#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1116
+#define WT_STAT_CONN_LOG_COMPRESS_SMALL 1122
/*! log: log release advances write LSN */
-#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1117
+#define WT_STAT_CONN_LOG_RELEASE_WRITE_LSN 1123
/*! log: log scan operations */
-#define WT_STAT_CONN_LOG_SCANS 1118
+#define WT_STAT_CONN_LOG_SCANS 1124
/*! log: log scan records requiring two reads */
-#define WT_STAT_CONN_LOG_SCAN_REREADS 1119
+#define WT_STAT_CONN_LOG_SCAN_REREADS 1125
/*! log: log server thread advances write LSN */
-#define WT_STAT_CONN_LOG_WRITE_LSN 1120
+#define WT_STAT_CONN_LOG_WRITE_LSN 1126
/*! log: log server thread write LSN walk skipped */
-#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1121
+#define WT_STAT_CONN_LOG_WRITE_LSN_SKIP 1127
/*! log: log sync operations */
-#define WT_STAT_CONN_LOG_SYNC 1122
+#define WT_STAT_CONN_LOG_SYNC 1128
/*! log: log sync_dir operations */
-#define WT_STAT_CONN_LOG_SYNC_DIR 1123
+#define WT_STAT_CONN_LOG_SYNC_DIR 1129
/*! log: log write operations */
-#define WT_STAT_CONN_LOG_WRITES 1124
+#define WT_STAT_CONN_LOG_WRITES 1130
/*! log: logging bytes consolidated */
-#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1125
+#define WT_STAT_CONN_LOG_SLOT_CONSOLIDATED 1131
/*! log: maximum log file size */
-#define WT_STAT_CONN_LOG_MAX_FILESIZE 1126
+#define WT_STAT_CONN_LOG_MAX_FILESIZE 1132
/*! log: number of pre-allocated log files to create */
-#define WT_STAT_CONN_LOG_PREALLOC_MAX 1127
+#define WT_STAT_CONN_LOG_PREALLOC_MAX 1133
/*! log: pre-allocated log files not ready and missed */
-#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1128
+#define WT_STAT_CONN_LOG_PREALLOC_MISSED 1134
/*! log: pre-allocated log files prepared */
-#define WT_STAT_CONN_LOG_PREALLOC_FILES 1129
+#define WT_STAT_CONN_LOG_PREALLOC_FILES 1135
/*! log: pre-allocated log files used */
-#define WT_STAT_CONN_LOG_PREALLOC_USED 1130
+#define WT_STAT_CONN_LOG_PREALLOC_USED 1136
/*! log: records processed by log scan */
-#define WT_STAT_CONN_LOG_SCAN_RECORDS 1131
+#define WT_STAT_CONN_LOG_SCAN_RECORDS 1137
/*! log: total in-memory size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_MEM 1132
+#define WT_STAT_CONN_LOG_COMPRESS_MEM 1138
/*! log: total log buffer size */
-#define WT_STAT_CONN_LOG_BUFFER_SIZE 1133
+#define WT_STAT_CONN_LOG_BUFFER_SIZE 1139
/*! log: total size of compressed records */
-#define WT_STAT_CONN_LOG_COMPRESS_LEN 1134
+#define WT_STAT_CONN_LOG_COMPRESS_LEN 1140
/*! log: written slots coalesced */
-#define WT_STAT_CONN_LOG_SLOT_COALESCED 1135
+#define WT_STAT_CONN_LOG_SLOT_COALESCED 1141
/*! log: yields waiting for previous log file close */
-#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1136
+#define WT_STAT_CONN_LOG_CLOSE_YIELDS 1142
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1137
+#define WT_STAT_CONN_REC_PAGE_DELETE_FAST 1143
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_CONN_REC_PAGES 1138
+#define WT_STAT_CONN_REC_PAGES 1144
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_CONN_REC_PAGES_EVICTION 1139
+#define WT_STAT_CONN_REC_PAGES_EVICTION 1145
/*! reconciliation: pages deleted */
-#define WT_STAT_CONN_REC_PAGE_DELETE 1140
+#define WT_STAT_CONN_REC_PAGE_DELETE 1146
/*! reconciliation: split bytes currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1141
+#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1147
/*! reconciliation: split objects currently awaiting free */
-#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1142
+#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1148
/*! session: open cursor count */
-#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1143
+#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1149
/*! session: open session count */
-#define WT_STAT_CONN_SESSION_OPEN 1144
+#define WT_STAT_CONN_SESSION_OPEN 1150
/*! thread-yield: page acquire busy blocked */
-#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1145
+#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1151
/*! thread-yield: page acquire eviction blocked */
-#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1146
+#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1152
/*! thread-yield: page acquire locked blocked */
-#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1147
+#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1153
/*! thread-yield: page acquire read blocked */
-#define WT_STAT_CONN_PAGE_READ_BLOCKED 1148
+#define WT_STAT_CONN_PAGE_READ_BLOCKED 1154
/*! thread-yield: page acquire time sleeping (usecs) */
-#define WT_STAT_CONN_PAGE_SLEEP 1149
+#define WT_STAT_CONN_PAGE_SLEEP 1155
/*! transaction: number of named snapshots created */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1150
+#define WT_STAT_CONN_TXN_SNAPSHOTS_CREATED 1156
/*! transaction: number of named snapshots dropped */
-#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1151
+#define WT_STAT_CONN_TXN_SNAPSHOTS_DROPPED 1157
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1152
+#define WT_STAT_CONN_TXN_BEGIN 1158
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1153
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1159
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1154
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1160
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1155
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1161
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1156
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1162
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1157
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1163
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1158
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1164
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1159
+#define WT_STAT_CONN_TXN_CHECKPOINT 1165
/*! transaction: transaction failures due to cache overflow */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1160
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1166
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1161
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1167
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1162
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1168
/*! transaction: transaction range of IDs currently pinned by named
* snapshots */
-#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1163
+#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1169
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1164
+#define WT_STAT_CONN_TXN_SYNC 1170
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1165
+#define WT_STAT_CONN_TXN_COMMIT 1171
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1166
+#define WT_STAT_CONN_TXN_ROLLBACK 1172
/*!
* @}
@@ -4163,91 +4676,93 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_DSRC_CACHE_READ 2055
/*! cache: pages read into cache requiring lookaside entries */
#define WT_STAT_DSRC_CACHE_READ_LOOKASIDE 2056
+/*! cache: pages requested from the cache */
+#define WT_STAT_DSRC_CACHE_PAGES_REQUESTED 2057
/*! cache: pages written from cache */
-#define WT_STAT_DSRC_CACHE_WRITE 2057
+#define WT_STAT_DSRC_CACHE_WRITE 2058
/*! cache: pages written requiring in-memory restoration */
-#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2058
+#define WT_STAT_DSRC_CACHE_WRITE_RESTORE 2059
/*! cache: unmodified pages evicted */
-#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2059
+#define WT_STAT_DSRC_CACHE_EVICTION_CLEAN 2060
/*! compression: compressed pages read */
-#define WT_STAT_DSRC_COMPRESS_READ 2060
+#define WT_STAT_DSRC_COMPRESS_READ 2061
/*! compression: compressed pages written */
-#define WT_STAT_DSRC_COMPRESS_WRITE 2061
+#define WT_STAT_DSRC_COMPRESS_WRITE 2062
/*! compression: page written failed to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2062
+#define WT_STAT_DSRC_COMPRESS_WRITE_FAIL 2063
/*! compression: page written was too small to compress */
-#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2063
+#define WT_STAT_DSRC_COMPRESS_WRITE_TOO_SMALL 2064
/*! compression: raw compression call failed, additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2064
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL_TEMPORARY 2065
/*! compression: raw compression call failed, no additional data available */
-#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2065
+#define WT_STAT_DSRC_COMPRESS_RAW_FAIL 2066
/*! compression: raw compression call succeeded */
-#define WT_STAT_DSRC_COMPRESS_RAW_OK 2066
+#define WT_STAT_DSRC_COMPRESS_RAW_OK 2067
/*! cursor: bulk-loaded cursor-insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2067
+#define WT_STAT_DSRC_CURSOR_INSERT_BULK 2068
/*! cursor: create calls */
-#define WT_STAT_DSRC_CURSOR_CREATE 2068
+#define WT_STAT_DSRC_CURSOR_CREATE 2069
/*! cursor: cursor-insert key and value bytes inserted */
-#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2069
+#define WT_STAT_DSRC_CURSOR_INSERT_BYTES 2070
/*! cursor: cursor-remove key bytes removed */
-#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2070
+#define WT_STAT_DSRC_CURSOR_REMOVE_BYTES 2071
/*! cursor: cursor-update value bytes updated */
-#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2071
+#define WT_STAT_DSRC_CURSOR_UPDATE_BYTES 2072
/*! cursor: insert calls */
-#define WT_STAT_DSRC_CURSOR_INSERT 2072
+#define WT_STAT_DSRC_CURSOR_INSERT 2073
/*! cursor: next calls */
-#define WT_STAT_DSRC_CURSOR_NEXT 2073
+#define WT_STAT_DSRC_CURSOR_NEXT 2074
/*! cursor: prev calls */
-#define WT_STAT_DSRC_CURSOR_PREV 2074
+#define WT_STAT_DSRC_CURSOR_PREV 2075
/*! cursor: remove calls */
-#define WT_STAT_DSRC_CURSOR_REMOVE 2075
+#define WT_STAT_DSRC_CURSOR_REMOVE 2076
/*! cursor: reset calls */
-#define WT_STAT_DSRC_CURSOR_RESET 2076
+#define WT_STAT_DSRC_CURSOR_RESET 2077
/*! cursor: restarted searches */
-#define WT_STAT_DSRC_CURSOR_RESTART 2077
+#define WT_STAT_DSRC_CURSOR_RESTART 2078
/*! cursor: search calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH 2078
+#define WT_STAT_DSRC_CURSOR_SEARCH 2079
/*! cursor: search near calls */
-#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2079
+#define WT_STAT_DSRC_CURSOR_SEARCH_NEAR 2080
/*! cursor: truncate calls */
-#define WT_STAT_DSRC_CURSOR_TRUNCATE 2080
+#define WT_STAT_DSRC_CURSOR_TRUNCATE 2081
/*! cursor: update calls */
-#define WT_STAT_DSRC_CURSOR_UPDATE 2081
+#define WT_STAT_DSRC_CURSOR_UPDATE 2082
/*! reconciliation: dictionary matches */
-#define WT_STAT_DSRC_REC_DICTIONARY 2082
+#define WT_STAT_DSRC_REC_DICTIONARY 2083
/*! reconciliation: fast-path pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2083
+#define WT_STAT_DSRC_REC_PAGE_DELETE_FAST 2084
/*! reconciliation: internal page key bytes discarded using suffix
* compression */
-#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2084
+#define WT_STAT_DSRC_REC_SUFFIX_COMPRESSION 2085
/*! reconciliation: internal page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2085
+#define WT_STAT_DSRC_REC_MULTIBLOCK_INTERNAL 2086
/*! reconciliation: internal-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2086
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_INTERNAL 2087
/*! reconciliation: leaf page key bytes discarded using prefix compression */
-#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2087
+#define WT_STAT_DSRC_REC_PREFIX_COMPRESSION 2088
/*! reconciliation: leaf page multi-block writes */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2088
+#define WT_STAT_DSRC_REC_MULTIBLOCK_LEAF 2089
/*! reconciliation: leaf-page overflow keys */
-#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2089
+#define WT_STAT_DSRC_REC_OVERFLOW_KEY_LEAF 2090
/*! reconciliation: maximum blocks required for a page */
-#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2090
+#define WT_STAT_DSRC_REC_MULTIBLOCK_MAX 2091
/*! reconciliation: overflow values written */
-#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2091
+#define WT_STAT_DSRC_REC_OVERFLOW_VALUE 2092
/*! reconciliation: page checksum matches */
-#define WT_STAT_DSRC_REC_PAGE_MATCH 2092
+#define WT_STAT_DSRC_REC_PAGE_MATCH 2093
/*! reconciliation: page reconciliation calls */
-#define WT_STAT_DSRC_REC_PAGES 2093
+#define WT_STAT_DSRC_REC_PAGES 2094
/*! reconciliation: page reconciliation calls for eviction */
-#define WT_STAT_DSRC_REC_PAGES_EVICTION 2094
+#define WT_STAT_DSRC_REC_PAGES_EVICTION 2095
/*! reconciliation: pages deleted */
-#define WT_STAT_DSRC_REC_PAGE_DELETE 2095
+#define WT_STAT_DSRC_REC_PAGE_DELETE 2096
/*! session: object compaction */
-#define WT_STAT_DSRC_SESSION_COMPACT 2096
+#define WT_STAT_DSRC_SESSION_COMPACT 2097
/*! session: open cursor count */
-#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2097
+#define WT_STAT_DSRC_SESSION_CURSOR_OPEN 2098
/*! transaction: update conflicts */
-#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2098
+#define WT_STAT_DSRC_TXN_UPDATE_CONFLICT 2099
/*!
* @}
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 9e5007b38ed..0c8abf36cfe 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -169,6 +169,8 @@ struct __wt_dsrc_stats;
typedef struct __wt_dsrc_stats WT_DSRC_STATS;
struct __wt_evict_entry;
typedef struct __wt_evict_entry WT_EVICT_ENTRY;
+struct __wt_evict_queue;
+ typedef struct __wt_evict_queue WT_EVICT_QUEUE;
struct __wt_evict_worker;
typedef struct __wt_evict_worker WT_EVICT_WORKER;
struct __wt_ext;
@@ -179,6 +181,14 @@ struct __wt_fair_lock;
typedef struct __wt_fair_lock WT_FAIR_LOCK;
struct __wt_fh;
typedef struct __wt_fh WT_FH;
+struct __wt_file_handle_inmem;
+ typedef struct __wt_file_handle_inmem WT_FILE_HANDLE_INMEM;
+struct __wt_file_handle_posix;
+ typedef struct __wt_file_handle_posix WT_FILE_HANDLE_POSIX;
+struct __wt_file_handle_win;
+ typedef struct __wt_file_handle_win WT_FILE_HANDLE_WIN;
+struct __wt_fstream;
+ typedef struct __wt_fstream WT_FSTREAM;
struct __wt_hazard;
typedef struct __wt_hazard WT_HAZARD;
struct __wt_ikey;
@@ -351,21 +361,23 @@ union __wt_rand_state;
#include "intpack.i" /* required by cell.i, packing.i */
-#include "buf.i"
+#include "buf.i" /* required by cell.i */
#include "cache.i" /* required by txn.i */
#include "cell.i" /* required by btree.i */
-#include "log.i"
-#include "misc.i"
#include "mutex.i" /* required by btree.i */
-#include "packing.i"
#include "txn.i" /* required by btree.i */
+#include "bitstring.i"
#include "btree.i" /* required by cursor.i */
#include "btree_cmp.i"
-#include "cursor.i"
-
-#include "bitstring.i"
#include "column.i"
+#include "cursor.i"
+#include "log.i"
+#include "misc.i"
+#include "os_fhandle.i"
+#include "os_fs.i"
+#include "os_fstream.i"
+#include "packing.i"
#include "serial.i"
#if defined(__cplusplus)
diff --git a/src/log/log.c b/src/log/log.c
index 1132b54f335..1c4298b73e5 100644
--- a/src/log/log.c
+++ b/src/log/log.c
@@ -8,6 +8,8 @@
#include "wt_internal.h"
+static int __log_openfile(
+ WT_SESSION_IMPL *, bool, WT_FH **, const char *, uint32_t);
static int __log_write_internal(
WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t);
@@ -93,8 +95,9 @@ __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn)
int
__wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
{
- WT_LOG *log;
WT_DECL_RET;
+ WT_FH *log_fh;
+ WT_LOG *log;
log = S2C(session)->log;
@@ -121,7 +124,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
"log_force_sync: sync directory %s to LSN %" PRIu32
"/%" PRIu32,
log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset));
- WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh));
+ WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
log->sync_dir_lsn = *min_lsn;
WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
}
@@ -129,12 +132,21 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn)
* Sync the log file if needed.
*/
if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) {
+ /*
+ * Get our own file handle to the log file. It is possible
+ * for the file handle in the log structure to change out
+ * from under us and either be NULL or point to a different
+ * file than we want.
+ */
+ WT_ERR(__log_openfile(session,
+ false, &log_fh, WT_LOG_FILENAME, min_lsn->l.file));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32,
- log->log_fh->name, min_lsn->l.file, min_lsn->l.offset));
- WT_ERR(__wt_fsync(session, log->log_fh, true));
+ log_fh->name, min_lsn->l.file, min_lsn->l.offset));
+ WT_ERR(__wt_fsync(session, log_fh, true));
log->sync_lsn = *min_lsn;
WT_STAT_FAST_CONN_INCR(session, log_sync);
+ WT_ERR(__wt_close(session, &log_fh));
WT_ERR(__wt_cond_signal(session, log->log_sync_cond));
}
err:
@@ -246,8 +258,8 @@ __log_get_files(WT_SESSION_IMPL *session,
log_path = conn->log_path;
if (log_path == NULL)
log_path = "";
- return (__wt_dirlist(session, log_path, file_prefix,
- WT_DIRLIST_INCLUDE, filesp, countp));
+ return (__wt_fs_directory_list(
+ session, log_path, file_prefix, filesp, countp));
}
/*
@@ -265,6 +277,9 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
uint32_t id, max;
u_int count, i;
+ *filesp = NULL;
+ *countp = 0;
+
id = 0;
log = S2C(session)->log;
@@ -295,26 +310,12 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session,
*countp = count;
if (0) {
-err: __wt_log_files_free(session, files, count);
+err: WT_TRET(__wt_fs_directory_list_free(session, &files, count));
}
return (ret);
}
/*
- * __wt_log_files_free --
- * Free memory associated with a log file list.
- */
-void
-__wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count)
-{
- u_int i;
-
- for (i = 0; i < count; i++)
- __wt_free(session, files[i]);
- __wt_free(session, files);
-}
-
-/*
* __log_filename --
* Given a log number, return a WT_ITEM of a generated log file name
* of the given prefix type.
@@ -431,21 +432,27 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh)
conn = S2C(session);
log = conn->log;
- ret = 0;
+
/*
* If the user configured zero filling, pre-allocate the log file
* manually. Otherwise use either fallocate or ftruncate to create
* and zero the log file based on what is available.
*/
if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL))
- ret = __log_zero(session, fh,
- WT_LOG_FIRST_RECORD, conn->log_file_max);
- else if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE ||
- (ret = __wt_fallocate(session, fh,
- WT_LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP)
- ret = __wt_ftruncate(session, fh,
- WT_LOG_FIRST_RECORD + conn->log_file_max);
- return (ret);
+ return (__log_zero(session, fh,
+ WT_LOG_FIRST_RECORD, conn->log_file_max));
+
+ /*
+ * We have exclusive access to the log file and there are no other
+ * writes happening concurrently, so there are no locking issues.
+ */
+ if ((ret = __wt_fallocate(
+ session, fh, WT_LOG_FIRST_RECORD, conn->log_file_max)) == 0)
+ return (0);
+ WT_RET_ERROR_OK(ret, ENOTSUP);
+
+ return (__wt_ftruncate(
+ session, fh, WT_LOG_FIRST_RECORD + conn->log_file_max));
}
/*
@@ -657,14 +664,17 @@ static int
__log_openfile(WT_SESSION_IMPL *session,
bool ok_create, WT_FH **fhp, const char *file_prefix, uint32_t id)
{
+ WT_CONNECTION_IMPL *conn;
WT_DECL_ITEM(buf);
WT_DECL_RET;
WT_LOG *log;
WT_LOG_DESC *desc;
WT_LOG_RECORD *logrec;
uint32_t allocsize;
+ u_int flags;
- log = S2C(session)->log;
+ conn = S2C(session);
+ log = conn->log;
if (log == NULL)
allocsize = WT_LOG_ALIGN;
else
@@ -673,8 +683,14 @@ __log_openfile(WT_SESSION_IMPL *session,
WT_ERR(__log_filename(session, id, file_prefix, buf));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"opening log %s", (const char *)buf->data));
- WT_ERR(__wt_open(session, buf->data,
- WT_FILE_TYPE_LOG, ok_create ? WT_OPEN_CREATE : 0, fhp));
+ flags = 0;
+ if (ok_create)
+ LF_SET(WT_OPEN_CREATE);
+ if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
+ LF_SET(WT_OPEN_DIRECTIO);
+ WT_ERR(__wt_open(
+ session, buf->data, WT_OPEN_FILE_TYPE_LOG, flags, fhp));
+
/*
* If we are not creating the log file but opening it for reading,
* check that the magic number and versions are correct.
@@ -745,12 +761,11 @@ __log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num)
* All file setup, writing the header and pre-allocation was done
* before. We only need to rename it.
*/
- WT_ERR(__wt_rename(session, from_path->data, to_path->data));
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data));
err: __wt_scr_free(session, &from_path);
__wt_scr_free(session, &to_path);
- if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
return (ret);
}
@@ -972,8 +987,7 @@ __log_truncate(WT_SESSION_IMPL *session,
}
}
err: WT_TRET(__wt_close(session, &log_fh));
- if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
return (ret);
}
@@ -1025,7 +1039,7 @@ __wt_log_allocfile(
/*
* Rename it into place and make it available.
*/
- WT_ERR(__wt_rename(session, from_path->data, to_path->data));
+ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data));
err: __wt_scr_free(session, &from_path);
__wt_scr_free(session, &to_path);
@@ -1048,7 +1062,7 @@ __wt_log_remove(WT_SESSION_IMPL *session,
WT_ERR(__log_filename(session, lognum, file_prefix, path));
WT_ERR(__wt_verbose(session, WT_VERB_LOG,
"log_remove: remove log %s", (char *)path->data));
- WT_ERR(__wt_remove(session, path->data));
+ WT_ERR(__wt_fs_remove(session, path->data));
err: __wt_scr_free(session, &path);
return (ret);
}
@@ -1084,7 +1098,7 @@ __wt_log_open(WT_SESSION_IMPL *session)
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"log_open: open fh to directory %s", conn->log_path));
WT_RET(__wt_open(session, conn->log_path,
- WT_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
+ WT_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh));
}
if (!F_ISSET(conn, WT_CONN_READONLY)) {
@@ -1101,9 +1115,8 @@ __wt_log_open(WT_SESSION_IMPL *session)
WT_ERR(__wt_log_remove(
session, WT_LOG_TMPNAME, lognum));
}
- __wt_log_files_free(session, logfiles, logcount);
- logfiles = NULL;
- logcount = 0;
+ WT_ERR(
+ __wt_fs_directory_list_free(session, &logfiles, logcount));
WT_ERR(__log_get_files(session,
WT_LOG_PREPNAME, &logfiles, &logcount));
for (i = 0; i < logcount; i++) {
@@ -1112,8 +1125,8 @@ __wt_log_open(WT_SESSION_IMPL *session)
WT_ERR(__wt_log_remove(
session, WT_LOG_PREPNAME, lognum));
}
- __wt_log_files_free(session, logfiles, logcount);
- logfiles = NULL;
+ WT_ERR(
+ __wt_fs_directory_list_free(session, &logfiles, logcount));
}
/*
@@ -1151,8 +1164,7 @@ __wt_log_open(WT_SESSION_IMPL *session)
FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED);
}
-err: if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
return (ret);
}
@@ -1188,8 +1200,7 @@ __wt_log_close(WT_SESSION_IMPL *session)
WT_RET(__wt_verbose(session, WT_VERB_LOG,
"closing log directory %s", log->log_dir_fh->name));
if (!F_ISSET(conn, WT_CONN_READONLY))
- WT_RET(
- __wt_directory_sync_fh(session, log->log_dir_fh));
+ WT_RET(__wt_fsync(session, log->log_dir_fh, true));
WT_RET(__wt_close(session, &log->log_dir_fh));
log->log_dir_fh = NULL;
}
@@ -1396,8 +1407,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep)
"/%" PRIu32,
log->log_dir_fh->name,
sync_lsn.l.file, sync_lsn.l.offset));
- WT_ERR(__wt_directory_sync_fh(
- session, log->log_dir_fh));
+ WT_ERR(__wt_fsync(session, log->log_dir_fh, true));
log->sync_dir_lsn = sync_lsn;
WT_STAT_FAST_CONN_INCR(session, log_sync_dir);
}
@@ -1538,8 +1548,8 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
}
WT_SET_LSN(&start_lsn, firstlog, 0);
WT_SET_LSN(&end_lsn, lastlog, 0);
- __wt_log_files_free(session, logfiles, logcount);
- logfiles = NULL;
+ WT_ERR(
+ __wt_fs_directory_list_free(session, &logfiles, logcount));
}
WT_ERR(__log_openfile(
session, false, &log_fh, WT_LOG_FILENAME, start_lsn.l.file));
@@ -1735,8 +1745,7 @@ advance:
err: WT_STAT_FAST_CONN_INCR(session, log_scans);
- if (logfiles != NULL)
- __wt_log_files_free(session, logfiles, logcount);
+ WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
__wt_scr_free(session, &buf);
__wt_scr_free(session, &decryptitem);
diff --git a/src/lsm/lsm_cursor.c b/src/lsm/lsm_cursor.c
index e023b2b407e..78235fb6a92 100644
--- a/src/lsm/lsm_cursor.c
+++ b/src/lsm/lsm_cursor.c
@@ -103,7 +103,6 @@ __clsm_enter_update(WT_CURSOR_LSM *clsm)
bool hard_limit, have_primary, ovfl;
lsm_tree = clsm->lsm_tree;
- ovfl = false;
session = (WT_SESSION_IMPL *)clsm->iface.session;
if (clsm->nchunks == 0) {
@@ -210,7 +209,7 @@ __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
goto open;
if (txn->isolation == WT_ISO_SNAPSHOT)
- __wt_txn_cursor_op(session);
+ WT_RET(__wt_txn_cursor_op(session));
/*
* Figure out how many updates are required for
@@ -1155,7 +1154,6 @@ __clsm_search_near(WT_CURSOR *cursor, int *exactp)
closest = NULL;
clsm = (WT_CURSOR_LSM *)cursor;
exact = 0;
- deleted = false;
CURSOR_API_CALL(cursor, session, search_near, NULL);
WT_CURSOR_NEEDKEY(cursor);
diff --git a/src/lsm/lsm_merge.c b/src/lsm/lsm_merge.c
index 6d907284546..1ff0a216c02 100644
--- a/src/lsm/lsm_merge.c
+++ b/src/lsm/lsm_merge.c
@@ -152,16 +152,13 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree,
u_int end_chunk, i, merge_max, merge_min, nchunks, start_chunk;
u_int oldest_gen, youngest_gen;
- chunk_size = 0;
- nchunks = 0;
- record_count = 0;
- chunk = youngest = NULL;
-
/* Clear the return parameters */
- *start = 0;
- *end = 0;
+ *start = *end = 0;
*records = 0;
+ chunk_size = 0;
+ chunk = youngest = NULL;
+
aggressive = lsm_tree->merge_aggressiveness;
merge_max = (aggressive > WT_LSM_AGGRESSIVE_THRESHOLD) ?
100 : lsm_tree->merge_max;
@@ -218,8 +215,8 @@ __lsm_merge_span(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree,
*/
retry_find:
oldest_gen = youngest_gen = lsm_tree->chunk[end_chunk]->generation;
- for (start_chunk = end_chunk + 1, record_count = 0;
- start_chunk > 0; ) {
+ for (record_count = 0,
+ start_chunk = end_chunk + 1; start_chunk > 0;) {
chunk = lsm_tree->chunk[start_chunk - 1];
youngest = lsm_tree->chunk[end_chunk];
nchunks = (end_chunk + 1) - start_chunk;
@@ -306,14 +303,12 @@ retry_find:
}
#endif
- WT_ASSERT(session,
- nchunks == 0 || (chunk != NULL && youngest != NULL));
+ WT_ASSERT(session, nchunks == 0 || (chunk != NULL && youngest != NULL));
+
/*
- * Don't do merges that are too small or across too many
- * generations.
+ * Don't do merges that are too small or across too many generations.
*/
- if (nchunks < merge_min ||
- oldest_gen - youngest_gen > max_gap) {
+ if (nchunks < merge_min || oldest_gen - youngest_gen > max_gap) {
for (i = 0; i < nchunks; i++) {
chunk = lsm_tree->chunk[start_chunk + i];
WT_ASSERT(session,
@@ -365,7 +360,6 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
bloom = NULL;
chunk = NULL;
dest = src = NULL;
- start_id = 0;
created_chunk = create_bloom = locked = in_sync = false;
/* Fast path if it's obvious no merges could be done. */
@@ -485,7 +479,7 @@ __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
++lsm_tree->merge_progressing;
WT_ERR(__wt_verbose(session, WT_VERB_LSM,
- "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
+ "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted",
record_count, insert_count));
/*
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index e19e2cd0126..7e100cb855c 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -331,7 +331,7 @@ __lsm_meta_read_v1(
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_buf_fmt(session, buf,
"key_format=u,value_format=u,memory_page_max=%" PRIu64,
- 2 * lsm_tree->chunk_max));
+ 2 * lsm_tree->chunk_size));
file_cfg[2] = buf->data;
WT_ERR(__wt_config_collapse(session, file_cfg, &fileconf));
lsm_tree->file_config = fileconf;
diff --git a/src/lsm/lsm_tree.c b/src/lsm/lsm_tree.c
index cb1ddf22f84..da106ae2089 100644
--- a/src/lsm/lsm_tree.c
+++ b/src/lsm/lsm_tree.c
@@ -235,7 +235,7 @@ __wt_lsm_tree_set_chunk_size(
if (!WT_PREFIX_SKIP(filename, "file:"))
WT_RET_MSG(session, EINVAL,
"Expected a 'file:' URI: %s", chunk->uri);
- WT_RET(__wt_filesize_name(session, filename, false, &size));
+ WT_RET(__wt_fs_size(session, filename, &size));
chunk->size = (uint64_t)size;
@@ -256,7 +256,7 @@ __lsm_tree_cleanup_old(WT_SESSION_IMPL *session, const char *uri)
{ WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };
bool exists;
- WT_RET(__wt_exist(session, uri + strlen("file:"), &exists));
+ WT_RET(__wt_fs_exist(session, uri + strlen("file:"), &exists));
if (exists)
WT_WITH_SCHEMA_LOCK(session, ret,
ret = __wt_schema_drop(session, uri, cfg));
@@ -1344,8 +1344,14 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session,
locked = true;
for (i = 0; i < lsm_tree->nchunks; i++) {
chunk = lsm_tree->chunk[i];
- if (file_func == __wt_checkpoint &&
- F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
+ /*
+ * If the chunk is on disk, don't include underlying handles in
+ * the checkpoint. Checking the "get handles" function is all
+ * we need to do, no further checkpoint calls are done if the
+ * handle is not gathered.
+ */
+ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
+ file_func == __wt_checkpoint_get_handles)
continue;
WT_ERR(__wt_schema_worker(session, chunk->uri,
file_func, name_func, cfg, open_flags));
diff --git a/src/lsm/lsm_work_unit.c b/src/lsm/lsm_work_unit.c
index 87771e2cb6c..821a996c38b 100644
--- a/src/lsm/lsm_work_unit.c
+++ b/src/lsm/lsm_work_unit.c
@@ -289,7 +289,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
}
/* Stop if a running transaction needs the chunk. */
- __wt_txn_update_oldest(session, true);
+ WT_RET(__wt_txn_update_oldest(session, true));
if (chunk->switch_txn == WT_TXN_NONE ||
!__wt_txn_visible_all(session, chunk->switch_txn)) {
WT_RET(__wt_verbose(session, WT_VERB_LSM,
@@ -525,7 +525,7 @@ __lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
ret = __wt_schema_drop(session, uri, drop_cfg));
if (ret == 0)
- ret = __wt_remove(session, uri + strlen("file:"));
+ ret = __wt_fs_remove(session, uri + strlen("file:"));
WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));
if (ret == EBUSY || ret == ENOENT)
diff --git a/src/meta/meta_track.c b/src/meta/meta_track.c
index a73b7e09d37..4fe628e319b 100644
--- a/src/meta/meta_track.c
+++ b/src/meta/meta_track.c
@@ -194,8 +194,8 @@ __meta_track_unroll(WT_SESSION_IMPL *session, WT_META_TRACK *trk)
__wt_err(session, ret,
"metadata unroll rename %s to %s", trk->b, trk->a);
- if (trk->a == NULL &&
- (ret = __wt_remove(session, trk->b + strlen("file:"))) != 0)
+ if (trk->a == NULL && (ret =
+ __wt_fs_remove(session, trk->b + strlen("file:"))) != 0)
__wt_err(session, ret,
"metadata unroll create %s", trk->b);
diff --git a/src/meta/meta_turtle.c b/src/meta/meta_turtle.c
index 0b287c228e5..635daf63d7f 100644
--- a/src/meta/meta_turtle.c
+++ b/src/meta/meta_turtle.c
@@ -71,24 +71,24 @@ __metadata_load_hot_backup(WT_SESSION_IMPL *session)
WT_DECL_ITEM(key);
WT_DECL_ITEM(value);
WT_DECL_RET;
- WT_FH *fh;
+ WT_FSTREAM *fs;
bool exist;
/* Look for a hot backup file: if we find it, load it. */
- WT_RET(__wt_exist(session, WT_METADATA_BACKUP, &exist));
+ WT_RET(__wt_fs_exist(session, WT_METADATA_BACKUP, &exist));
if (!exist)
return (0);
- WT_RET(__wt_open(session, WT_METADATA_BACKUP,
- WT_FILE_TYPE_REGULAR, WT_OPEN_READONLY | WT_STREAM_READ, &fh));
+ WT_RET(__wt_fopen(session,
+ WT_METADATA_BACKUP, 0, WT_STREAM_READ, &fs));
/* Read line pairs and load them into the metadata file. */
WT_ERR(__wt_scr_alloc(session, 512, &key));
WT_ERR(__wt_scr_alloc(session, 512, &value));
for (;;) {
- WT_ERR(__wt_getline(session, key, fh));
+ WT_ERR(__wt_getline(session, fs, key));
if (key->size == 0)
break;
- WT_ERR(__wt_getline(session, value, fh));
+ WT_ERR(__wt_getline(session, fs, value));
if (value->size == 0)
WT_ERR(__wt_illegal_value(session, WT_METADATA_BACKUP));
WT_ERR(__wt_metadata_update(session, key->data, value->data));
@@ -96,7 +96,7 @@ __metadata_load_hot_backup(WT_SESSION_IMPL *session)
F_SET(S2C(session), WT_CONN_WAS_BACKUP);
-err: WT_TRET(__wt_close(session, &fh));
+err: WT_TRET(__wt_fclose(session, &fs));
__wt_scr_free(session, &key);
__wt_scr_free(session, &value);
return (ret);
@@ -128,7 +128,7 @@ __metadata_load_bulk(WT_SESSION_IMPL *session)
continue;
/* If the file exists, it's all good. */
- WT_ERR(__wt_exist(session, key, &exist));
+ WT_ERR(__wt_fs_exist(session, key, &exist));
if (exist)
continue;
@@ -156,7 +156,7 @@ int
__wt_turtle_init(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
- bool exist_backup, exist_incr, exist_turtle, load;
+ bool exist_backup, exist_incr, exist_isrc, exist_turtle, load;
char *metaconf;
metaconf = NULL;
@@ -182,21 +182,28 @@ __wt_turtle_init(WT_SESSION_IMPL *session)
* that is an error. Otherwise, if there's already a turtle file, we're
* done.
*/
- WT_RET(__wt_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr));
- WT_RET(__wt_exist(session, WT_METADATA_BACKUP, &exist_backup));
- WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist_turtle));
+ WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr));
+ WT_RET(__wt_fs_exist(session, WT_INCREMENTAL_SRC, &exist_isrc));
+ WT_RET(__wt_fs_exist(session, WT_METADATA_BACKUP, &exist_backup));
+ WT_RET(__wt_fs_exist(session, WT_METADATA_TURTLE, &exist_turtle));
if (exist_turtle) {
- if (exist_incr)
+ /*
+ * We need to detect the difference between a source database
+ * that may have crashed with an incremental backup file
+ * and a destination database that incorrectly ran recovery.
+ */
+ if (exist_incr && !exist_isrc)
WT_RET_MSG(session, EINVAL,
"Incremental backup after running recovery "
- "is not allowed.");
+ "is not allowed");
/*
* If we have a backup file and metadata and turtle files,
* we want to recreate the metadata from the backup.
*/
if (exist_backup) {
- WT_RET(__wt_msg(session, "Both %s and %s exist. "
- "Recreating metadata from backup.",
+ WT_RET(__wt_msg(session,
+ "Both %s and %s exist; recreating metadata from "
+ "backup",
WT_METADATA_TURTLE, WT_METADATA_BACKUP));
WT_RET(__wt_remove_if_exists(session, WT_METAFILE));
WT_RET(__wt_remove_if_exists(
@@ -242,7 +249,7 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep)
{
WT_DECL_ITEM(buf);
WT_DECL_RET;
- WT_FH *fh;
+ WT_FSTREAM *fs;
bool exist, match;
*valuep = NULL;
@@ -253,24 +260,23 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep)
* the turtle file, and that means returning the default configuration
* string for the metadata file.
*/
- WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist));
+ WT_RET(__wt_fs_exist(session, WT_METADATA_TURTLE, &exist));
if (!exist)
return (strcmp(key, WT_METAFILE_URI) == 0 ?
__metadata_config(session, valuep) : WT_NOTFOUND);
- WT_RET(__wt_open(session, WT_METADATA_TURTLE,
- WT_FILE_TYPE_REGULAR, WT_OPEN_READONLY | WT_STREAM_READ, &fh));
+ WT_RET(__wt_fopen(session, WT_METADATA_TURTLE, 0, WT_STREAM_READ, &fs));
/* Search for the key. */
WT_ERR(__wt_scr_alloc(session, 512, &buf));
for (match = false;;) {
- WT_ERR(__wt_getline(session, buf, fh));
+ WT_ERR(__wt_getline(session, fs, buf));
if (buf->size == 0)
WT_ERR(WT_NOTFOUND);
if (strcmp(key, buf->data) == 0)
match = true;
/* Key matched: read the subsequent line for the value. */
- WT_ERR(__wt_getline(session, buf, fh));
+ WT_ERR(__wt_getline(session, fs, buf));
if (buf->size == 0)
WT_ERR(__wt_illegal_value(session, WT_METADATA_TURTLE));
if (match)
@@ -280,7 +286,7 @@ __wt_turtle_read(WT_SESSION_IMPL *session, const char *key, char **valuep)
/* Copy the value for the caller. */
WT_ERR(__wt_strdup(session, buf->data, valuep));
-err: WT_TRET(__wt_close(session, &fh));
+err: WT_TRET(__wt_fclose(session, &fs));
__wt_scr_free(session, &buf);
if (ret != 0)
@@ -295,38 +301,34 @@ err: WT_TRET(__wt_close(session, &fh));
int
__wt_turtle_update(WT_SESSION_IMPL *session, const char *key, const char *value)
{
- WT_FH *fh;
- WT_DECL_ITEM(buf);
+ WT_FSTREAM *fs;
WT_DECL_RET;
int vmajor, vminor, vpatch;
const char *version;
- fh = NULL;
+ fs = NULL;
/*
* Create the turtle setup file: we currently re-write it from scratch
* every time.
*/
- WT_RET(__wt_open(session, WT_METADATA_TURTLE_SET,
- WT_FILE_TYPE_REGULAR, WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &fh));
+ WT_RET(__wt_fopen(session, WT_METADATA_TURTLE_SET,
+ WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, WT_STREAM_WRITE, &fs));
version = wiredtiger_version(&vmajor, &vminor, &vpatch);
- WT_ERR(__wt_scr_alloc(session, 2 * 1024, &buf));
- WT_ERR(__wt_buf_fmt(session, buf,
+ WT_ERR(__wt_fprintf(session, fs,
"%s\n%s\n%s\n" "major=%d,minor=%d,patch=%d\n%s\n%s\n",
WT_METADATA_VERSION_STR, version,
WT_METADATA_VERSION, vmajor, vminor, vpatch,
key, value));
- WT_ERR(__wt_write(session, fh, 0, buf->size, buf->data));
- /* Flush the handle and rename the file into place. */
- ret = __wt_sync_handle_and_rename(
- session, &fh, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE);
+ /* Flush the stream and rename the file into place. */
+ ret = __wt_sync_and_rename(
+ session, &fs, WT_METADATA_TURTLE_SET, WT_METADATA_TURTLE);
/* Close any file handle left open, remove any temporary file. */
-err: WT_TRET(__wt_close(session, &fh));
+err: WT_TRET(__wt_fclose(session, &fs));
WT_TRET(__wt_remove_if_exists(session, WT_METADATA_TURTLE_SET));
- __wt_scr_free(session, &buf);
return (ret);
}
diff --git a/src/os_common/filename.c b/src/os_common/filename.c
index dfd67284948..5f174288350 100644
--- a/src/os_common/filename.c
+++ b/src/os_common/filename.c
@@ -60,9 +60,9 @@ __wt_remove_if_exists(WT_SESSION_IMPL *session, const char *name)
{
bool exist;
- WT_RET(__wt_exist(session, name, &exist));
+ WT_RET(__wt_fs_exist(session, name, &exist));
if (exist)
- WT_RET(__wt_remove(session, name));
+ WT_RET(__wt_fs_remove(session, name));
return (0);
}
@@ -78,7 +78,7 @@ __wt_rename_and_sync_directory(
bool same_directory;
/* Rename the source file to the target. */
- WT_RET(__wt_rename(session, from, to));
+ WT_RET(__wt_fs_rename(session, from, to));
/*
* Flush the backing directory to guarantee the rename. My reading of
@@ -89,7 +89,7 @@ __wt_rename_and_sync_directory(
* with specific mount options. Flush both of the from/to directories
* until it's a performance problem.
*/
- WT_RET(__wt_directory_sync(session, from));
+ WT_RET(__wt_fs_directory_sync(session, from));
/*
* In almost all cases, we're going to be renaming files in the same
@@ -101,29 +101,7 @@ __wt_rename_and_sync_directory(
(fp != NULL && tp != NULL &&
fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
- return (same_directory ? 0 : __wt_directory_sync(session, to));
-}
-
-/*
- * __wt_sync_handle_and_rename --
- * Sync and close a handle, and swap it into place.
- */
-int
-__wt_sync_handle_and_rename(
- WT_SESSION_IMPL *session, WT_FH **fhp, const char *from, const char *to)
-{
- WT_DECL_RET;
- WT_FH *fh;
-
- fh = *fhp;
- *fhp = NULL;
-
- /* Flush to disk and close the handle. */
- ret = __wt_fsync(session, fh, true);
- WT_TRET(__wt_close(session, &fh));
- WT_RET(ret);
-
- return (__wt_rename_and_sync_directory(session, from, to));
+ return (same_directory ? 0 : __wt_fs_directory_sync(session, to));
}
/*
@@ -160,10 +138,9 @@ __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to)
WT_ERR(__wt_remove_if_exists(session, tmp->data));
/* Open the from and temporary file handles. */
- WT_ERR(__wt_open(session, from,
- WT_FILE_TYPE_REGULAR, WT_OPEN_READONLY, &ffh));
- WT_ERR(__wt_open(session, tmp->data,
- WT_FILE_TYPE_REGULAR, WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &tfh));
+ WT_ERR(__wt_open(session, from, WT_OPEN_FILE_TYPE_REGULAR, 0, &ffh));
+ WT_ERR(__wt_open(session, tmp->data, WT_OPEN_FILE_TYPE_REGULAR,
+ WT_OPEN_CREATE | WT_OPEN_EXCLUSIVE, &tfh));
/*
* Allocate a copy buffer. Don't use a scratch buffer, this thing is
@@ -182,7 +159,10 @@ __wt_copy_and_sync(WT_SESSION *wt_session, const char *from, const char *to)
/* Close the from handle, then swap the temporary file into place. */
WT_ERR(__wt_close(session, &ffh));
- ret = __wt_sync_handle_and_rename(session, &tfh, tmp->data, to);
+ WT_ERR(__wt_fsync(session, tfh, true));
+ WT_ERR(__wt_close(session, &tfh));
+
+ ret = __wt_rename_and_sync_directory(session, tmp->data, to);
err: WT_TRET(__wt_close(session, &ffh));
WT_TRET(__wt_close(session, &tfh));
diff --git a/src/os_common/os_fhandle.c b/src/os_common/os_fhandle.c
index b16b2e24bfa..818829203e0 100644
--- a/src/os_common/os_fhandle.c
+++ b/src/os_common/os_fhandle.c
@@ -9,20 +9,88 @@
#include "wt_internal.h"
/*
- * __wt_handle_search --
- * Search for a matching handle.
+ * __fhandle_method_finalize --
+ * Fail with EINVAL if a required WT_FILE_HANDLE method (close, read,
+ * size and, unless read-only, truncate and write) is NULL, so custom
+ * file systems with incomplete implementations are rejected at open.
+ */
+static int
+__fhandle_method_finalize(
+ WT_SESSION_IMPL *session, WT_FILE_HANDLE *handle, bool readonly)
+{
+#define WT_HANDLE_METHOD_REQ(name) \
+ if (handle->name == NULL) \
+ WT_RET_MSG(session, EINVAL, \
+ "a WT_FILE_HANDLE.%s method must be configured", #name)
+
+ WT_HANDLE_METHOD_REQ(close);
+ /* not required: fadvise */
+ /* not required: fallocate */
+ /* not required: fallocate_nolock */
+ /* not required: lock */
+ /* not required: map */
+ /* not required: map_discard */
+ /* not required: map_preload */
+ /* not required: map_unmap */
+ WT_HANDLE_METHOD_REQ(read);
+ WT_HANDLE_METHOD_REQ(size);
+ /* not required: sync */
+ /* not required: sync_nowait */
+ if (!readonly) {
+ WT_HANDLE_METHOD_REQ(truncate);
+ WT_HANDLE_METHOD_REQ(write);
+ }
+
+ return (0);
+}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_handle_is_open --
+ * Return if there's an open handle matching a name.
*/
bool
-__wt_handle_search(WT_SESSION_IMPL *session,
- const char *name, bool increment_ref, WT_FH *newfh, WT_FH **fhp)
+__wt_handle_is_open(WT_SESSION_IMPL *session, const char *name)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_FH *fh;
+ uint64_t bucket, hash;
+ bool found;
+
+ conn = S2C(session);
+ found = false;
+
+ hash = __wt_hash_city64(name, strlen(name));
+ bucket = hash % WT_HASH_ARRAY_SIZE;
+
+ __wt_spin_lock(session, &conn->fh_lock);
+
+ TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
+ if (strcmp(name, fh->name) == 0) {
+ found = true;
+ break;
+ }
+
+ __wt_spin_unlock(session, &conn->fh_lock);
+
+ return (found);
+}
+#endif
+
+/*
+ * __handle_search --
+ * Search for a matching handle.
+ */
+static bool
+__handle_search(
+ WT_SESSION_IMPL *session, const char *name, WT_FH *newfh, WT_FH **fhp)
{
WT_CONNECTION_IMPL *conn;
WT_FH *fh;
uint64_t bucket, hash;
bool found;
- if (fhp != NULL)
- *fhp = NULL;
+ *fhp = NULL;
conn = S2C(session);
found = false;
@@ -33,15 +101,13 @@ __wt_handle_search(WT_SESSION_IMPL *session,
__wt_spin_lock(session, &conn->fh_lock);
/*
- * If we already have the file open, optionally increment the reference
- * count and return a pointer.
+ * If we already have the file open, increment the reference count and
+ * return a pointer.
*/
TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
if (strcmp(name, fh->name) == 0) {
- if (increment_ref)
- ++fh->ref;
- if (fhp != NULL)
- *fhp = fh;
+ ++fh->ref;
+ *fhp = fh;
found = true;
break;
}
@@ -49,13 +115,11 @@ __wt_handle_search(WT_SESSION_IMPL *session,
/* If we don't find a match, optionally add a new entry. */
if (!found && newfh != NULL) {
newfh->name_hash = hash;
- WT_CONN_FILE_INSERT(conn, newfh, bucket);
+ WT_FILE_HANDLE_INSERT(conn, newfh, bucket);
(void)__wt_atomic_add32(&conn->open_file_count, 1);
- if (increment_ref)
- ++newfh->ref;
- if (fhp != NULL)
- *fhp = newfh;
+ ++newfh->ref;
+ *fhp = newfh;
}
__wt_spin_unlock(session, &conn->fh_lock);
@@ -68,8 +132,8 @@ __wt_handle_search(WT_SESSION_IMPL *session,
* Optionally output a verbose message on handle open.
*/
static inline int
-__open_verbose(WT_SESSION_IMPL *session,
- const char *name, uint32_t file_type, uint32_t flags)
+__open_verbose(
+ WT_SESSION_IMPL *session, const char *name, int file_type, u_int flags)
{
#ifdef HAVE_VERBOSE
WT_DECL_RET;
@@ -85,19 +149,19 @@ __open_verbose(WT_SESSION_IMPL *session,
*/
switch (file_type) {
- case WT_FILE_TYPE_CHECKPOINT:
+ case WT_OPEN_FILE_TYPE_CHECKPOINT:
file_type_tag = "checkpoint";
break;
- case WT_FILE_TYPE_DATA:
+ case WT_OPEN_FILE_TYPE_DATA:
file_type_tag = "data";
break;
- case WT_FILE_TYPE_DIRECTORY:
+ case WT_OPEN_FILE_TYPE_DIRECTORY:
file_type_tag = "directory";
break;
- case WT_FILE_TYPE_LOG:
+ case WT_OPEN_FILE_TYPE_LOG:
file_type_tag = "log";
break;
- case WT_FILE_TYPE_REGULAR:
+ case WT_OPEN_FILE_TYPE_REGULAR:
file_type_tag = "regular";
break;
default:
@@ -115,18 +179,16 @@ __open_verbose(WT_SESSION_IMPL *session,
}
WT_OPEN_VERBOSE_FLAG(WT_OPEN_CREATE, "create");
+ WT_OPEN_VERBOSE_FLAG(WT_OPEN_DIRECTIO, "direct-IO");
WT_OPEN_VERBOSE_FLAG(WT_OPEN_EXCLUSIVE, "exclusive");
WT_OPEN_VERBOSE_FLAG(WT_OPEN_FIXED, "fixed");
WT_OPEN_VERBOSE_FLAG(WT_OPEN_READONLY, "readonly");
- WT_OPEN_VERBOSE_FLAG(WT_STREAM_APPEND, "stream-append");
- WT_OPEN_VERBOSE_FLAG(WT_STREAM_READ, "stream-read");
- WT_OPEN_VERBOSE_FLAG(WT_STREAM_WRITE, "stream-write");
if (tmp->size != 0)
WT_ERR(__wt_buf_catfmt(session, tmp, ")"));
ret = __wt_verbose(session, WT_VERB_FILEOPS,
- "%s: handle-open: type %s%s",
+ "%s: file-open: type %s%s",
name, file_type_tag, tmp->size == 0 ? "" : (char *)tmp->data);
err: __wt_scr_free(session, &tmp);
@@ -146,17 +208,19 @@ err: __wt_scr_free(session, &tmp);
*/
int
__wt_open(WT_SESSION_IMPL *session,
- const char *name, uint32_t file_type, uint32_t flags, WT_FH **fhp)
+ const char *name, WT_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_FH *fh;
+ WT_FILE_SYSTEM *file_system;
bool lock_file, open_called;
char *path;
WT_ASSERT(session, file_type != 0); /* A file type is required. */
conn = S2C(session);
+ file_system = conn->file_system;
fh = NULL;
open_called = false;
path = NULL;
@@ -164,21 +228,12 @@ __wt_open(WT_SESSION_IMPL *session,
WT_RET(__open_verbose(session, name, file_type, flags));
/* Check if the handle is already open. */
- if (__wt_handle_search(session, name, true, NULL, &fh)) {
- /*
- * XXX
- * The in-memory implementation has to reset the file offset
- * when a file is re-opened (which obviously also depends on
- * in-memory configurations never opening a file in more than
- * one thread at a time). This needs to be fixed.
- */
- if (F_ISSET(fh, WT_FH_IN_MEMORY) && fh->ref == 1)
- fh->off = 0;
+ if (__handle_search(session, name, NULL, &fh)) {
*fhp = fh;
return (0);
}
- /* Allocate a structure and set the name. */
+ /* Allocate and initialize the handle. */
WT_ERR(__wt_calloc_one(session, &fh));
WT_ERR(__wt_strdup(session, name, &fh->name));
@@ -200,17 +255,21 @@ __wt_open(WT_SESSION_IMPL *session,
WT_ERR(__wt_filename(session, name, &path));
/* Call the underlying open function. */
- WT_ERR(conn->handle_open(
- session, fh, path == NULL ? name : path, file_type, flags));
+ WT_ERR(file_system->open_file(file_system, &session->iface,
+ path == NULL ? name : path, file_type, flags, &fh->handle));
open_called = true;
+ WT_ERR(__fhandle_method_finalize(
+ session, fh->handle, LF_ISSET(WT_OPEN_READONLY)));
+
/*
* Repeat the check for a match: if there's no match, link our newly
* created handle onto the database's list of files.
*/
- if (__wt_handle_search(session, name, true, fh, fhp)) {
+ if (__handle_search(session, name, fh, fhp)) {
err: if (open_called)
- WT_TRET(fh->fh_close(session, fh));
+ WT_TRET(fh->handle->close(
+ fh->handle, (WT_SESSION *)session));
if (fh != NULL) {
__wt_free(session, fh->name);
__wt_free(session, fh);
@@ -242,7 +301,7 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
/* Track handle-close as a file operation, so open and close match. */
WT_RET(__wt_verbose(
- session, WT_VERB_FILEOPS, "%s: handle-close", fh->name));
+ session, WT_VERB_FILEOPS, "%s: file-close", fh->name));
/*
* If the reference count hasn't gone to 0, or if it's an in-memory
@@ -252,20 +311,20 @@ __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp)
*/
__wt_spin_lock(session, &conn->fh_lock);
WT_ASSERT(session, fh->ref > 0);
- if ((fh->ref > 0 && --fh->ref > 0) || F_ISSET(fh, WT_FH_IN_MEMORY)) {
+ if ((fh->ref > 0 && --fh->ref > 0)) {
__wt_spin_unlock(session, &conn->fh_lock);
return (0);
}
/* Remove from the list. */
bucket = fh->name_hash % WT_HASH_ARRAY_SIZE;
- WT_CONN_FILE_REMOVE(conn, fh, bucket);
+ WT_FILE_HANDLE_REMOVE(conn, fh, bucket);
(void)__wt_atomic_sub32(&conn->open_file_count, 1);
__wt_spin_unlock(session, &conn->fh_lock);
/* Discard underlying resources. */
- ret = fh->fh_close(session, fh);
+ ret = fh->handle->close(fh->handle, (WT_SESSION *)session);
__wt_free(session, fh->name);
__wt_free(session, fh);
@@ -287,18 +346,13 @@ __wt_close_connection_close(WT_SESSION_IMPL *session)
conn = S2C(session);
while ((fh = TAILQ_FIRST(&conn->fhqh)) != NULL) {
- /*
- * In-memory configurations will have open files, but the ref
- * counts should be zero.
- */
- if (!F_ISSET(conn, WT_CONN_IN_MEMORY) || fh->ref != 0) {
+ if (fh->ref != 0) {
ret = EBUSY;
__wt_errx(session,
"Connection has open file handles: %s", fh->name);
}
fh->ref = 1;
- F_CLR(fh, WT_FH_IN_MEMORY);
WT_TRET(__wt_close(session, &fh));
}
diff --git a/src/os_common/os_fs_inmemory.c b/src/os_common/os_fs_inmemory.c
index 260514eac66..53da3f10e5c 100644
--- a/src/os_common/os_fs_inmemory.c
+++ b/src/os_common/os_fs_inmemory.c
@@ -8,475 +8,588 @@
#include "wt_internal.h"
-static int __im_handle_size(WT_SESSION_IMPL *, WT_FH *, wt_off_t *);
-
/*
- * In-memory information.
+ * File system interface for in-memory implementation.
*/
typedef struct {
+ WT_FILE_SYSTEM iface;
+
+ TAILQ_HEAD(__wt_fhhash_inmem,
+ __wt_file_handle_inmem) fhhash[WT_HASH_ARRAY_SIZE];
+ TAILQ_HEAD(__wt_fh_inmem_qh, __wt_file_handle_inmem) fhqh;
+
WT_SPINLOCK lock;
-} WT_IM;
+} WT_FILE_SYSTEM_INMEM;
+
+static int __im_file_size(WT_FILE_HANDLE *, WT_SESSION *, wt_off_t *);
/*
- * __im_directory_list --
- * Get a list of files from a directory, in-memory version.
+ * __im_handle_search --
+ * Return a matching handle, if one exists.
*/
-static int
-__im_directory_list(WT_SESSION_IMPL *session, const char *dir,
- const char *prefix, uint32_t flags, char ***dirlist, u_int *countp)
+static WT_FILE_HANDLE_INMEM *
+__im_handle_search(WT_FILE_SYSTEM *file_system, const char *name)
{
- WT_UNUSED(session);
- WT_UNUSED(dir);
- WT_UNUSED(prefix);
- WT_UNUSED(flags);
- WT_UNUSED(dirlist);
- WT_UNUSED(countp);
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ uint64_t bucket, hash;
+
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
- WT_RET_MSG(session, ENOTSUP, "directory-list");
+ hash = __wt_hash_city64(name, strlen(name));
+ bucket = hash % WT_HASH_ARRAY_SIZE;
+ TAILQ_FOREACH(im_fh, &im_fs->fhhash[bucket], hashq)
+ if (strcmp(im_fh->iface.name, name) == 0)
+ break;
+
+ return (im_fh);
}
/*
- * __im_directory_sync --
- * Flush a directory to ensure file creation is durable.
+ * __im_handle_remove --
+ * Destroy an in-memory file handle. Should only happen on remove or
+ * shutdown.
*/
static int
-__im_directory_sync(WT_SESSION_IMPL *session, const char *path)
+__im_handle_remove(WT_SESSION_IMPL *session,
+ WT_FILE_SYSTEM *file_system, WT_FILE_HANDLE_INMEM *im_fh)
{
- WT_UNUSED(session);
- WT_UNUSED(path);
+ WT_FILE_HANDLE *fhp;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ uint64_t bucket;
+
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+
+ if (im_fh->ref != 0)
+ WT_RET_MSG(session, EBUSY,
+ "%s: file-remove", im_fh->iface.name);
+
+ bucket = im_fh->name_hash % WT_HASH_ARRAY_SIZE;
+ WT_FILE_HANDLE_REMOVE(im_fs, im_fh, bucket);
+
+ /* Clean up private information. */
+ __wt_buf_free(session, &im_fh->buf);
+
+ /* Clean up public information. */
+ fhp = (WT_FILE_HANDLE *)im_fh;
+ __wt_free(session, fhp->name);
+
+ __wt_free(session, im_fh);
+
return (0);
}
/*
- * __im_file_exist --
- * Return if the file exists.
+ * __im_handle_size --
+ * Return the handle's data size.
*/
-static int
-__im_file_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
+static void
+__im_handle_size(WT_FILE_HANDLE_INMEM *im_fh, wt_off_t *sizep)
{
- *existp = __wt_handle_search(session, name, false, NULL, NULL);
- return (0);
+ /*
+ * XXX
+ * This function exists as a place for this comment. MongoDB assumes
+ * any file with content will have a non-zero size. In memory tables
+ * generally are zero-sized, make MongoDB happy.
+ */
+ *sizep = im_fh->buf.size == 0 ? 1024 : (wt_off_t)im_fh->buf.size;
}
/*
- * __im_file_remove --
- * POSIX remove.
+ * __im_fs_directory_list --
+ * Return the directory contents.
*/
static int
-__im_file_remove(WT_SESSION_IMPL *session, const char *name)
+__im_fs_directory_list(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *directory,
+ const char *prefix, char ***dirlistp, uint32_t *countp)
{
WT_DECL_RET;
- WT_FH *fh;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
+ size_t dirallocsz, len;
+ uint32_t count;
+ char *name, **entries;
+
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ *dirlistp = NULL;
+ *countp = 0;
+
+ dirallocsz = 0;
+ len = strlen(directory);
+ entries = NULL;
+
+ __wt_spin_lock(session, &im_fs->lock);
+
+ count = 0;
+ TAILQ_FOREACH(im_fh, &im_fs->fhqh, q) {
+ name = im_fh->iface.name;
+ if (strncmp(name, directory, len) != 0 ||
+ (prefix != NULL && !WT_PREFIX_MATCH(name + len, prefix)))
+ continue;
+
+ WT_ERR(__wt_realloc_def(
+ session, &dirallocsz, count + 1, &entries));
+ WT_ERR(__wt_strdup(session, name, &entries[count]));
+ ++count;
+ }
- if (__wt_handle_search(session, name, true, NULL, &fh)) {
- WT_ASSERT(session, fh->ref == 1);
+ *dirlistp = entries;
+ *countp = count;
+
+err: __wt_spin_unlock(session, &im_fs->lock);
+ if (ret == 0)
+ return (0);
- /* Force a discard of the handle. */
- F_CLR(fh, WT_FH_IN_MEMORY);
- ret = __wt_close(session, &fh);
+ if (entries != NULL) {
+ while (count > 0)
+ __wt_free(session, entries[--count]);
+ __wt_free(session, entries);
}
- return (ret);
+
+ WT_RET_MSG(session, ret,
+ "%s: directory-list, prefix \"%s\"",
+ directory, prefix == NULL ? "" : prefix);
}
/*
- * __im_file_rename --
- * POSIX rename.
+ * __im_fs_directory_list_free --
+ * Free memory returned by __im_fs_directory_list.
*/
static int
-__im_file_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+__im_fs_directory_list_free(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, char **dirlist, uint32_t count)
{
- WT_CONNECTION_IMPL *conn;
- WT_DECL_RET;
- WT_FH *fh;
- uint64_t bucket, hash;
- char *to_name;
-
- conn = S2C(session);
-
- /* We'll need a copy of the target name. */
- WT_RET(__wt_strdup(session, to, &to_name));
+ WT_SESSION_IMPL *session;
- __wt_spin_lock(session, &conn->fh_lock);
+ WT_UNUSED(file_system);
- /* Make sure the target name isn't active. */
- hash = __wt_hash_city64(to, strlen(to));
- bucket = hash % WT_HASH_ARRAY_SIZE;
- TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
- if (strcmp(to, fh->name) == 0)
- WT_ERR(EPERM);
-
- /* Find the source name. */
- hash = __wt_hash_city64(from, strlen(from));
- bucket = hash % WT_HASH_ARRAY_SIZE;
- TAILQ_FOREACH(fh, &conn->fhhash[bucket], hashq)
- if (strcmp(from, fh->name) == 0)
- break;
- if (fh == NULL)
- WT_ERR(ENOENT);
-
- /* Remove source from the list. */
- WT_CONN_FILE_REMOVE(conn, fh, bucket);
+ session = (WT_SESSION_IMPL *)wt_session;
- /* Swap the names. */
- __wt_free(session, fh->name);
- fh->name = to_name;
- to_name = NULL;
-
- /* Put source back on the list. */
- hash = __wt_hash_city64(to, strlen(to));
- bucket = hash % WT_HASH_ARRAY_SIZE;
- WT_CONN_FILE_INSERT(conn, fh, bucket);
-
- if (0) {
-err: __wt_free(session, to_name);
+ if (dirlist != NULL) {
+ while (count > 0)
+ __wt_free(session, dirlist[--count]);
+ __wt_free(session, dirlist);
}
- __wt_spin_unlock(session, &conn->fh_lock);
-
- return (ret);
+ return (0);
}
/*
- * __im_file_size --
- * Get the size of a file in bytes, by file name.
+ * __im_fs_exist --
+ * Return if the file exists.
*/
static int
-__im_file_size(
- WT_SESSION_IMPL *session, const char *name, bool silent, wt_off_t *sizep)
+__im_fs_exist(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, bool *existp)
{
- WT_DECL_RET;
- WT_FH *fh;
- WT_IM *im;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
- WT_UNUSED(silent);
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
- im = S2C(session)->inmemory;
- __wt_spin_lock(session, &im->lock);
+ __wt_spin_lock(session, &im_fs->lock);
- if (__wt_handle_search(session, name, true, NULL, &fh)) {
- WT_ERR(__im_handle_size(session, fh, sizep));
- WT_ERR(__wt_close(session, &fh));
- } else
- ret = ENOENT;
+ *existp = __im_handle_search(file_system, name) != NULL;
-err: __wt_spin_unlock(session, &im->lock);
- return (ret);
+ __wt_spin_unlock(session, &im_fs->lock);
+ return (0);
}
/*
- * __im_handle_advise --
- * POSIX fadvise.
+ * __im_fs_remove --
+ * POSIX remove.
*/
static int
-__im_handle_advise(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, wt_off_t len, int advice)
+__im_fs_remove(
+ WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(advice);
- return (ENOTSUP);
+ WT_DECL_RET;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
+
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_spin_lock(session, &im_fs->lock);
+
+ ret = ENOENT;
+ if ((im_fh = __im_handle_search(file_system, name)) != NULL)
+ ret = __im_handle_remove(session, file_system, im_fh);
+
+ __wt_spin_unlock(session, &im_fs->lock);
+ return (ret);
}
/*
- * __im_handle_close --
- * ANSI C close/fclose.
+ * __im_fs_rename --
+ * POSIX rename.
*/
static int
-__im_handle_close(WT_SESSION_IMPL *session, WT_FH *fh)
+__im_fs_rename(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *from, const char *to)
{
- __wt_buf_free(session, &fh->buf);
+ WT_DECL_RET;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
+ uint64_t bucket;
+ char *copy;
+
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_spin_lock(session, &im_fs->lock);
+
+ ret = ENOENT;
+ if ((im_fh = __im_handle_search(file_system, from)) != NULL) {
+ WT_ERR(__wt_strdup(session, to, &copy));
+ __wt_free(session, im_fh->iface.name);
+ im_fh->iface.name = copy;
+
+ bucket = im_fh->name_hash % WT_HASH_ARRAY_SIZE;
+ WT_FILE_HANDLE_REMOVE(im_fs, im_fh, bucket);
+ im_fh->name_hash = __wt_hash_city64(to, strlen(to));
+ bucket = im_fh->name_hash % WT_HASH_ARRAY_SIZE;
+ WT_FILE_HANDLE_INSERT(im_fs, im_fh, bucket);
+ }
- return (0);
+err: __wt_spin_unlock(session, &im_fs->lock);
+ return (ret);
}
/*
- * __im_handle_getc --
- * ANSI C fgetc.
+ * __im_fs_size --
+ * Get the size of a file in bytes, by file name.
*/
static int
-__im_handle_getc(WT_SESSION_IMPL *session, WT_FH *fh, int *chp)
+__im_fs_size(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, wt_off_t *sizep)
{
- WT_IM *im;
+ WT_DECL_RET;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
- im = S2C(session)->inmemory;
- __wt_spin_lock(session, &im->lock);
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
- if (fh->off >= fh->buf.size)
- *chp = EOF;
+ __wt_spin_lock(session, &im_fs->lock);
+
+ /* Search for the handle, then get its size. */
+ if ((im_fh = __im_handle_search(file_system, name)) == NULL)
+ ret = ENOENT;
else
- *chp = ((char *)fh->buf.data)[fh->off++];
+ __im_handle_size(im_fh, sizep);
- __wt_spin_unlock(session, &im->lock);
- return (0);
-}
+ __wt_spin_unlock(session, &im_fs->lock);
-/*
- * __im_handle_lock --
- * Lock/unlock a file.
- */
-static int
-__im_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
-{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(lock);
- return (0);
+ return (ret);
}
/*
- * __im_handle_printf --
- * ANSI C vfprintf.
+ * __im_file_close --
+ * ANSI C close.
*/
static int
-__im_handle_printf(
- WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, va_list ap)
+__im_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
- va_list ap_copy;
- WT_DECL_ITEM(tmp);
- WT_DECL_RET;
- WT_IM *im;
- size_t len;
-
- im = S2C(session)->inmemory;
-
- /* Build the string we're writing. */
- WT_RET(__wt_scr_alloc(session, strlen(fmt) * 2 + 128, &tmp));
- for (;;) {
- va_copy(ap_copy, ap);
- len = (size_t)vsnprintf(tmp->mem, tmp->memsize, fmt, ap_copy);
- va_end(ap_copy);
- if (len < tmp->memsize) {
- tmp->data = tmp->mem;
- tmp->size = len;
- break;
- }
- WT_ERR(__wt_buf_extend(session, tmp, len + 1));
- }
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
- __wt_spin_lock(session, &im->lock);
+ im_fh = (WT_FILE_HANDLE_INMEM *)file_handle;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_handle->file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
- /* Grow the handle's buffer as necessary. */
- WT_ERR(__wt_buf_grow(session, &fh->buf, fh->off + len));
+ __wt_spin_lock(session, &im_fs->lock);
- /* Copy the data into place and update the offset. */
- memcpy((uint8_t *)fh->buf.mem + fh->off, tmp->data, len);
- fh->off += len;
+ --im_fh->ref;
-err: __wt_spin_unlock(session, &im->lock);
+ __wt_spin_unlock(session, &im_fs->lock);
- __wt_scr_free(session, &tmp);
- return (ret);
+ return (0);
}
/*
- * __im_handle_read --
+ * __im_file_read --
* POSIX pread.
*/
static int
-__im_handle_read(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+__im_file_read(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
{
WT_DECL_RET;
- WT_IM *im;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
size_t off;
- im = S2C(session)->inmemory;
- __wt_spin_lock(session, &im->lock);
+ im_fh = (WT_FILE_HANDLE_INMEM *)file_handle;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_handle->file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_spin_lock(session, &im_fs->lock);
off = (size_t)offset;
- if (off < fh->buf.size) {
- len = WT_MIN(len, fh->buf.size - off);
- memcpy(buf, (uint8_t *)fh->buf.mem + off, len);
- fh->off = off + len;
+ if (off < im_fh->buf.size) {
+ len = WT_MIN(len, im_fh->buf.size - off);
+ memcpy(buf, (uint8_t *)im_fh->buf.mem + off, len);
+ im_fh->off = off + len;
} else
ret = WT_ERROR;
- __wt_spin_unlock(session, &im->lock);
+ __wt_spin_unlock(session, &im_fs->lock);
if (ret == 0)
return (0);
WT_RET_MSG(session, WT_ERROR,
"%s: handle-read: failed to read %" WT_SIZET_FMT " bytes at "
"offset %" WT_SIZET_FMT,
- fh->name, len, off);
+ file_handle->name, len, off);
}
/*
- * __im_handle_size --
+ * __im_file_size --
* Get the size of a file in bytes, by file handle.
*/
static int
-__im_handle_size(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+__im_file_size(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t *sizep)
{
- WT_UNUSED(session);
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
- /*
- * XXX hack - MongoDB assumes that any file with content will have a
- * non-zero size. In memory tables generally are zero-sized, make
- * MongoDB happy.
- */
- *sizep = fh->buf.size == 0 ? 1024 : (wt_off_t)fh->buf.size;
- return (0);
-}
+ im_fh = (WT_FILE_HANDLE_INMEM *)file_handle;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_handle->file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
-/*
- * __im_handle_sync --
- * POSIX fflush/fsync.
- */
-static int
-__im_handle_sync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
-{
- WT_UNUSED(session);
- WT_UNUSED(fh);
+ __wt_spin_lock(session, &im_fs->lock);
- /*
- * Callers attempting asynchronous flush handle ENOTSUP returns, and
- * won't make further attempts.
- */
- return (block ? 0 : ENOTSUP);
+ __im_handle_size(im_fh, sizep);
+
+ __wt_spin_unlock(session, &im_fs->lock);
+
+ return (0);
}
/*
- * __im_handle_truncate --
+ * __im_file_truncate --
* POSIX ftruncate.
*/
static int
-__im_handle_truncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset)
+__im_file_truncate(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset)
{
WT_DECL_RET;
- WT_IM *im;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
size_t off;
- im = S2C(session)->inmemory;
- __wt_spin_lock(session, &im->lock);
+ im_fh = (WT_FILE_HANDLE_INMEM *)file_handle;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_handle->file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_spin_lock(session, &im_fs->lock);
/*
- * Grow the buffer as necessary, clear any new space in the file,
- * and reset the file's data length.
+ * Grow the buffer as necessary, clear any new space in the file, and
+ * reset the file's data length.
*/
off = (size_t)offset;
- WT_ERR(__wt_buf_grow(session, &fh->buf, off));
- if (fh->buf.size < off)
- memset((uint8_t *)
- fh->buf.data + fh->buf.size, 0, off - fh->buf.size);
- fh->buf.size = off;
+ WT_ERR(__wt_buf_grow(session, &im_fh->buf, off));
+ if (im_fh->buf.size < off)
+ memset((uint8_t *)im_fh->buf.data + im_fh->buf.size,
+ 0, off - im_fh->buf.size);
+ im_fh->buf.size = off;
-err: __wt_spin_unlock(session, &im->lock);
+err: __wt_spin_unlock(session, &im_fs->lock);
return (ret);
}
/*
- * __im_handle_write --
+ * __im_file_write --
* POSIX pwrite.
*/
static int
-__im_handle_write(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+__im_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
+ wt_off_t offset, size_t len, const void *buf)
{
WT_DECL_RET;
- WT_IM *im;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
size_t off;
- im = S2C(session)->inmemory;
- __wt_spin_lock(session, &im->lock);
+ im_fh = (WT_FILE_HANDLE_INMEM *)file_handle;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_handle->file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_spin_lock(session, &im_fs->lock);
off = (size_t)offset;
- WT_ERR(__wt_buf_grow(session, &fh->buf, off + len + 1024));
+ WT_ERR(__wt_buf_grow(session, &im_fh->buf, off + len + 1024));
- memcpy((uint8_t *)fh->buf.data + off, buf, len);
- if (off + len > fh->buf.size)
- fh->buf.size = off + len;
- fh->off = off + len;
+ memcpy((uint8_t *)im_fh->buf.data + off, buf, len);
+ if (off + len > im_fh->buf.size)
+ im_fh->buf.size = off + len;
+ im_fh->off = off + len;
-err: __wt_spin_unlock(session, &im->lock);
+err: __wt_spin_unlock(session, &im_fs->lock);
if (ret == 0)
return (0);
WT_RET_MSG(session, ret,
"%s: handle-write: failed to write %" WT_SIZET_FMT " bytes at "
"offset %" WT_SIZET_FMT,
- fh->name, len, off);
+ file_handle->name, len, off);
}
/*
- * __im_handle_open --
+ * __im_file_open --
* POSIX fopen/open.
*/
static int
-__im_handle_open(WT_SESSION_IMPL *session,
- WT_FH *fh, const char *path, uint32_t file_type, uint32_t flags)
+__im_file_open(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
+ const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ WT_FILE_HANDLE **file_handlep)
{
- WT_UNUSED(session);
- WT_UNUSED(path);
+ WT_DECL_RET;
+ WT_FILE_HANDLE *file_handle;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
+ uint64_t bucket, hash;
+
WT_UNUSED(file_type);
WT_UNUSED(flags);
- fh->off = 0;
- F_SET(fh, WT_FH_IN_MEMORY);
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
+ session = (WT_SESSION_IMPL *)wt_session;
- fh->fh_advise = __im_handle_advise;
- fh->fh_close = __im_handle_close;
- fh->fh_getc = __im_handle_getc;
- fh->fh_lock = __im_handle_lock;
- fh->fh_printf = __im_handle_printf;
- fh->fh_read = __im_handle_read;
- fh->fh_size = __im_handle_size;
- fh->fh_sync = __im_handle_sync;
- fh->fh_truncate = __im_handle_truncate;
- fh->fh_write = __im_handle_write;
+ __wt_spin_lock(session, &im_fs->lock);
- return (0);
+ /*
+ * First search the file queue, if we find it, assert there's only a
+ * single reference, in-memory only supports a single handle on any
+ * file, for now.
+ */
+ im_fh = __im_handle_search(file_system, name);
+ if (im_fh != NULL) {
+
+ if (im_fh->ref != 0)
+ WT_ERR_MSG(session, EBUSY,
+ "%s: file-open: already open", name);
+
+ im_fh->ref = 1;
+ im_fh->off = 0;
+
+ *file_handlep = (WT_FILE_HANDLE *)im_fh;
+
+ __wt_spin_unlock(session, &im_fs->lock);
+ return (0);
+ }
+
+ /* The file hasn't been opened before, create a new one. */
+ WT_ERR(__wt_calloc_one(session, &im_fh));
+
+ /* Initialize public information. */
+ file_handle = (WT_FILE_HANDLE *)im_fh;
+ file_handle->file_system = file_system;
+ WT_ERR(__wt_strdup(session, name, &file_handle->name));
+
+ /* Initialize private information. */
+ im_fh->ref = 1;
+ im_fh->off = 0;
+
+ hash = __wt_hash_city64(name, strlen(name));
+ bucket = hash % WT_HASH_ARRAY_SIZE;
+ im_fh->name_hash = hash;
+ WT_FILE_HANDLE_INSERT(im_fs, im_fh, bucket);
+
+ file_handle->close = __im_file_close;
+ file_handle->read = __im_file_read;
+ file_handle->size = __im_file_size;
+ file_handle->truncate = __im_file_truncate;
+ file_handle->write = __im_file_write;
+
+ *file_handlep = file_handle;
+
+ if (0) {
+err: __wt_free(session, im_fh);
+ }
+
+ __wt_spin_unlock(session, &im_fs->lock);
+ return (ret);
}
/*
- * __wt_os_inmemory --
- * Initialize an in-memory configuration.
+ * __im_terminate --
+ * Terminate an in-memory configuration.
*/
-int
-__wt_os_inmemory(WT_SESSION_IMPL *session)
+static int
+__im_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session)
{
- WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
- WT_IM *im;
+ WT_FILE_HANDLE_INMEM *im_fh;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ WT_SESSION_IMPL *session;
- conn = S2C(session);
- im = NULL;
+ WT_UNUSED(file_system);
- /* Initialize the in-memory jump table. */
- conn->file_directory_list = __im_directory_list;
- conn->file_directory_sync = __im_directory_sync;
- conn->file_exist = __im_file_exist;
- conn->file_remove = __im_file_remove;
- conn->file_rename = __im_file_rename;
- conn->file_size = __im_file_size;
- conn->handle_open = __im_handle_open;
-
- /* Allocate an in-memory structure. */
- WT_RET(__wt_calloc_one(session, &im));
- WT_ERR(__wt_spin_init(session, &im->lock, "in-memory I/O"));
- conn->inmemory = im;
+ session = (WT_SESSION_IMPL *)wt_session;
+ im_fs = (WT_FILE_SYSTEM_INMEM *)file_system;
- return (0);
+ while ((im_fh = TAILQ_FIRST(&im_fs->fhqh)) != NULL)
+ WT_TRET(__im_handle_remove(session, file_system, im_fh));
+
+ __wt_spin_destroy(session, &im_fs->lock);
+ __wt_free(session, im_fs);
-err: __wt_free(session, im);
return (ret);
}
/*
- * __wt_os_inmemory_cleanup --
- * Discard an in-memory configuration.
+ * __wt_os_inmemory --
+ * Initialize an in-memory configuration.
*/
int
-__wt_os_inmemory_cleanup(WT_SESSION_IMPL *session)
+__wt_os_inmemory(WT_SESSION_IMPL *session)
{
WT_DECL_RET;
- WT_IM *im;
+ WT_FILE_SYSTEM *file_system;
+ WT_FILE_SYSTEM_INMEM *im_fs;
+ u_int i;
- if ((im = S2C(session)->inmemory) == NULL)
- return (0);
- S2C(session)->inmemory = NULL;
+ WT_RET(__wt_calloc_one(session, &im_fs));
+
+ /* Initialize private information. */
+ TAILQ_INIT(&im_fs->fhqh);
+ for (i = 0; i < WT_HASH_ARRAY_SIZE; i++)
+ TAILQ_INIT(&im_fs->fhhash[i]);
- __wt_spin_destroy(session, &im->lock);
- __wt_free(session, im);
+ WT_ERR(__wt_spin_init(session, &im_fs->lock, "in-memory I/O"));
+
+ /* Initialize the in-memory jump table. */
+ file_system = (WT_FILE_SYSTEM *)im_fs;
+ file_system->directory_list = __im_fs_directory_list;
+ file_system->directory_list_free = __im_fs_directory_list_free;
+ file_system->exist = __im_fs_exist;
+ file_system->open_file = __im_file_open;
+ file_system->remove = __im_fs_remove;
+ file_system->rename = __im_fs_rename;
+ file_system->size = __im_fs_size;
+ file_system->terminate = __im_terminate;
+
+ /* Switch the file system into place. */
+ S2C(session)->file_system = (WT_FILE_SYSTEM *)im_fs;
+
+ return (0);
+err: __wt_free(session, im_fs);
return (ret);
}
diff --git a/src/os_common/os_fs_stdio.c b/src/os_common/os_fs_stdio.c
deleted file mode 100644
index 9baba9b6945..00000000000
--- a/src/os_common/os_fs_stdio.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __stdio_handle_advise --
- * POSIX fadvise.
- */
-static int
-__stdio_handle_advise(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, wt_off_t len, int advice)
-{
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(advice);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-advise", fh->name);
-}
-
-/*
- * __stdio_handle_allocate --
- * POSIX fallocate.
- */
-static int
-__stdio_handle_allocate(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
-{
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-allocate", fh->name);
-}
-
-/*
- * __stdio_handle_close --
- * ANSI C close/fclose.
- */
-static int
-__stdio_handle_close(WT_SESSION_IMPL *session, WT_FH *fh)
-{
- WT_RET_MSG(session, ENOTSUP, "%s: handle-close", fh->name);
-}
-
-/*
- * __stdio_handle_getc --
- * ANSI C fgetc.
- */
-static int
-__stdio_handle_getc(WT_SESSION_IMPL *session, WT_FH *fh, int *chp)
-{
- WT_UNUSED(chp);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-getc", fh->name);
-}
-
-/*
- * __stdio_handle_lock --
- * Lock/unlock a file.
- */
-static int
-__stdio_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
-{
- WT_UNUSED(lock);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-lock", fh->name);
-}
-
-/*
- * __stdio_handle_map --
- * Map a file.
- */
-static int
-__stdio_handle_map(WT_SESSION_IMPL *session,
- WT_FH *fh, void *p, size_t *lenp, void **mappingcookie)
-{
- WT_UNUSED(p);
- WT_UNUSED(lenp);
- WT_UNUSED(mappingcookie);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-map", fh->name);
-}
-
-/*
- * __stdio_handle_map_discard --
- * Discard a section of a mapped region.
- */
-static int
-__stdio_handle_map_discard(
- WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t len)
-{
- WT_UNUSED(p);
- WT_UNUSED(len);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-map-discard", fh->name);
-}
-
-/*
- * __stdio_handle_map_preload --
- * Preload a section of a mapped region.
- */
-static int
-__stdio_handle_map_preload(
- WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t len)
-{
- WT_UNUSED(p);
- WT_UNUSED(len);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-map-preload", fh->name);
-}
-
-/*
- * __stdio_handle_map_unmap --
- * Unmap a file.
- */
-static int
-__stdio_handle_map_unmap(WT_SESSION_IMPL *session,
- WT_FH *fh, void *p, size_t len, void **mappingcookie)
-{
- WT_UNUSED(p);
- WT_UNUSED(len);
- WT_UNUSED(mappingcookie);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-map-unmap", fh->name);
-}
-
-/*
- * __stdio_handle_printf --
- * ANSI C vfprintf.
- */
-static int
-__stdio_handle_printf(
- WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, va_list ap)
-{
- if (vfprintf(fh->fp, fmt, ap) >= 0)
- return (0);
- WT_RET_MSG(session, EIO, "%s: handle-printf: vfprintf", fh->name);
-}
-
-/*
- * __stdio_handle_read --
- * POSIX pread.
- */
-static int
-__stdio_handle_read(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
-{
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(buf);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-read", fh->name);
-}
-
-/*
- * __stdio_handle_size --
- * Get the size of a file in bytes, by file handle.
- */
-static int
-__stdio_handle_size(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
-{
- WT_UNUSED(sizep);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-size", fh->name);
-}
-
-/*
- * __stdio_handle_sync --
- * POSIX fflush/fsync.
- */
-static int
-__stdio_handle_sync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
-{
- WT_UNUSED(block);
-
- if (fflush(fh->fp) == 0)
- return (0);
- WT_RET_MSG(session, __wt_errno(), "%s: handle-sync: fflush", fh->name);
-}
-
-/*
- * __stdio_handle_truncate --
- * POSIX ftruncate.
- */
-static int
-__stdio_handle_truncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
-{
- WT_UNUSED(len);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-truncate", fh->name);
-}
-
-/*
- * __stdio_handle_write --
- * POSIX pwrite.
- */
-static int
-__stdio_handle_write(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
-{
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(buf);
- WT_RET_MSG(session, ENOTSUP, "%s: handle-write", fh->name);
-}
-
-/*
- * __stdio_func_init --
- * Initialize stdio functions.
- */
-static void
-__stdio_func_init(WT_FH *fh, const char *name, FILE *fp)
-{
- fh->name = name;
- fh->fp = fp;
-
- fh->fh_advise = __stdio_handle_advise;
- fh->fh_allocate = __stdio_handle_allocate;
- fh->fh_close = __stdio_handle_close;
- fh->fh_getc = __stdio_handle_getc;
- fh->fh_lock = __stdio_handle_lock;
- fh->fh_map = __stdio_handle_map;
- fh->fh_map_discard = __stdio_handle_map_discard;
- fh->fh_map_preload = __stdio_handle_map_preload;
- fh->fh_map_unmap = __stdio_handle_map_unmap;
- fh->fh_printf = __stdio_handle_printf;
- fh->fh_read = __stdio_handle_read;
- fh->fh_size = __stdio_handle_size;
- fh->fh_sync = __stdio_handle_sync;
- fh->fh_truncate = __stdio_handle_truncate;
- fh->fh_write = __stdio_handle_write;
-}
-
-/*
- * __wt_os_stdio --
- * Initialize the stdio configuration.
- */
-int
-__wt_os_stdio(WT_SESSION_IMPL *session)
-{
- __stdio_func_init(WT_STDERR(session), "stderr", stderr);
- __stdio_func_init(WT_STDOUT(session), "stdout", stdout);
-
- return (0);
-}
diff --git a/src/os_common/os_fstream.c b/src/os_common/os_fstream.c
new file mode 100644
index 00000000000..fc0daf1c211
--- /dev/null
+++ b/src/os_common/os_fstream.c
@@ -0,0 +1,213 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/* Buffer size for streamed reads/writes. */
+#define WT_STREAM_BUFSIZE 8192
+
+/*
+ * __fstream_close --
+ * Close a stream handle.
+ */
+static int
+__fstream_close(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ WT_DECL_RET;
+
+ if (!F_ISSET(fs, WT_STREAM_READ))
+ WT_TRET(fs->flush(session, fs));
+
+ WT_TRET(__wt_close(session, &fs->fh));
+ __wt_buf_free(session, &fs->buf);
+ __wt_free(session, fs);
+ return (ret);
+}
+
+/*
+ * __fstream_flush --
+ * Flush the data from a stream.
+ */
+static int
+__fstream_flush(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ if (fs->buf.size > 0) {
+ WT_RET(__wt_write(
+ session, fs->fh, fs->off, fs->buf.size, fs->buf.data));
+ fs->off += (wt_off_t)fs->buf.size;
+ fs->buf.size = 0;
+ }
+
+ return (0);
+}
+
+/*
+ * __fstream_flush_notsup --
+ * Stream flush unsupported.
+ */
+static int
+__fstream_flush_notsup(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ WT_RET_MSG(session, ENOTSUP, "%s: flush", fs->name);
+}
+
+/*
+ * __fstream_getline --
+ * Get a line from a stream.
+ *
+ * Implementation of the POSIX getline or BSD fgetln functions (finding the
+ * function in a portable way is hard, it's simple enough to write it instead).
+ *
+ * Note: Unlike the standard getline calls, this function doesn't include the
+ * trailing newline character in the returned buffer and discards empty lines
+ * (so the caller's EOF marker is a returned line length of 0).
+ */
+static int
+__fstream_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fs, WT_ITEM *buf)
+{
+ const char *p;
+ size_t len;
+ char c;
+
+ /*
+ * We always NUL-terminate the returned string (even if it's empty),
+ * make sure there's buffer space for a trailing NUL in all cases.
+ */
+ WT_RET(__wt_buf_init(session, buf, 100));
+
+ for (;;) {
+ /* Check if we need to refill the buffer. */
+ if (WT_PTRDIFF(fs->buf.data, fs->buf.mem) >= fs->buf.size) {
+ len = WT_MIN(WT_STREAM_BUFSIZE,
+ (size_t)(fs->size - fs->off));
+ if (len == 0)
+ break; /* EOF */
+ WT_RET(__wt_buf_initsize(session, &fs->buf, len));
+ WT_RET(__wt_read(
+ session, fs->fh, fs->off, len, fs->buf.mem));
+ fs->off += (wt_off_t)len;
+ }
+
+ c = *(p = fs->buf.data);
+ fs->buf.data = ++p;
+
+ /* Leave space for a trailing NUL. */
+ WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
+ if (c == '\n') {
+ if (buf->size == 0)
+ continue;
+ break;
+ }
+ ((char *)buf->mem)[buf->size++] = c;
+ }
+
+ ((char *)buf->mem)[buf->size] = '\0';
+
+ return (0);
+}
+
+/*
+ * __fstream_getline_notsup --
+ * Stream getline unsupported.
+ */
+static int
+__fstream_getline_notsup(WT_SESSION_IMPL *session, WT_FSTREAM *fs, WT_ITEM *buf)
+{
+ WT_UNUSED(buf);
+ WT_RET_MSG(session, ENOTSUP, "%s: getline", fs->name);
+}
+
+/*
+ * __fstream_printf --
+ * ANSI C vfprintf.
+ */
+static int
+__fstream_printf(
+ WT_SESSION_IMPL *session, WT_FSTREAM *fs, const char *fmt, va_list ap)
+{
+ WT_ITEM *buf;
+ va_list ap_copy;
+ size_t len, space;
+ char *p;
+
+ buf = &fs->buf;
+
+ for (;;) {
+ va_copy(ap_copy, ap);
+ p = (char *)((uint8_t *)buf->mem + buf->size);
+ WT_ASSERT(session, buf->memsize >= buf->size);
+ space = buf->memsize - buf->size;
+ len = (size_t)vsnprintf(p, space, fmt, ap_copy);
+ va_end(ap_copy);
+
+ if (len < space) {
+ buf->size += len;
+
+ return (buf->size >= WT_STREAM_BUFSIZE ?
+ __wt_fflush(session, fs) : 0);
+ }
+ WT_RET(__wt_buf_extend(session, buf, buf->size + len + 1));
+ }
+}
+
+/*
+ * __fstream_printf_notsup --
+ * ANSI C vfprintf unsupported.
+ */
+static int
+__fstream_printf_notsup(
+ WT_SESSION_IMPL *session, WT_FSTREAM *fs, const char *fmt, va_list ap)
+{
+ WT_UNUSED(fmt);
+ WT_UNUSED(ap);
+ WT_RET_MSG(session, ENOTSUP, "%s: printf", fs->name);
+}
+
+/*
+ * __wt_fopen --
+ * Open a stream handle.
+ */
+int
+__wt_fopen(WT_SESSION_IMPL *session,
+ const char *name, uint32_t open_flags, uint32_t flags, WT_FSTREAM **fsp)
+{
+ WT_DECL_RET;
+ WT_FH *fh;
+ WT_FSTREAM *fs;
+
+ fs = NULL;
+
+ WT_RET(__wt_open(
+ session, name, WT_OPEN_FILE_TYPE_REGULAR, open_flags, &fh));
+
+ WT_ERR(__wt_calloc_one(session, &fs));
+ fs->fh = fh;
+ fs->name = fh->name;
+ fs->flags = flags;
+
+ fs->close = __fstream_close;
+ WT_ERR(__wt_filesize(session, fh, &fs->size));
+ if (LF_ISSET(WT_STREAM_APPEND))
+ fs->off = fs->size;
+ if (LF_ISSET(WT_STREAM_APPEND | WT_STREAM_WRITE)) {
+ fs->flush = __fstream_flush;
+ fs->getline = __fstream_getline_notsup;
+ fs->printf = __fstream_printf;
+ } else {
+ WT_ASSERT(session, LF_ISSET(WT_STREAM_READ));
+ fs->flush = __fstream_flush_notsup;
+ fs->getline = __fstream_getline;
+ fs->printf = __fstream_printf_notsup;
+ }
+ *fsp = fs;
+ return (0);
+
+err: WT_TRET(__wt_close(session, &fh));
+ __wt_free(session, *fsp);
+ return (ret);
+}
diff --git a/src/os_common/os_fstream_stdio.c b/src/os_common/os_fstream_stdio.c
new file mode 100644
index 00000000000..4b0c761024b
--- /dev/null
+++ b/src/os_common/os_fstream_stdio.c
@@ -0,0 +1,84 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __stdio_close --
+ * Closing a stdio stream is not supported: always fails with ENOTSUP.
+ */
+static int
+__stdio_close(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ WT_RET_MSG(session, ENOTSUP, "%s: close", fs->name);
+}
+
+/*
+ * __stdio_flush --
+ * ANSI C fflush on the stream's FILE; failures report __wt_errno().
+ */
+static int
+__stdio_flush(WT_SESSION_IMPL *session, WT_FSTREAM *fs)
+{
+ if (fflush(fs->fp) == 0)
+ return (0);
+ WT_RET_MSG(session, __wt_errno(), "%s: flush", fs->name);
+}
+
+/*
+ * __stdio_getline --
+ * Reading from a stdio stream is not supported: always ENOTSUP.
+ */
+static int
+__stdio_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fs, WT_ITEM *buf)
+{
+ WT_UNUSED(buf);
+ WT_RET_MSG(session, ENOTSUP, "%s: getline", fs->name);
+}
+
+/*
+ * __stdio_printf --
+ * ANSI C vfprintf to the stream's FILE; a negative result maps to EIO.
+ */
+static int
+__stdio_printf(
+ WT_SESSION_IMPL *session, WT_FSTREAM *fs, const char *fmt, va_list ap)
+{
+ if (vfprintf(fs->fp, fmt, ap) >= 0)
+ return (0);
+ WT_RET_MSG(session, EIO, "%s: printf", fs->name);
+}
+
+/*
+ * __stdio_init --
+ * Bind a stream to a name and FILE, and install the stdio methods.
+ */
+static void
+__stdio_init(WT_FSTREAM *fs, const char *name, FILE *fp)
+{
+ fs->name = name;
+ fs->fp = fp;
+
+ fs->close = __stdio_close;
+ fs->flush = __stdio_flush;
+ fs->getline = __stdio_getline;
+ fs->printf = __stdio_printf;
+}
+
+/*
+ * __wt_os_stdio --
+ * Initialize the session's stderr and stdout streams; returns 0.
+ */
+int
+__wt_os_stdio(WT_SESSION_IMPL *session)
+{
+ __stdio_init(WT_STDERR(session), "stderr", stderr);
+ __stdio_init(WT_STDOUT(session), "stdout", stdout);
+
+ return (0);
+}
diff --git a/src/os_common/os_getline.c b/src/os_common/os_getline.c
deleted file mode 100644
index 01e11581edf..00000000000
--- a/src/os_common/os_getline.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __wt_getline --
- * Get a line from a stream.
- *
- * Implementation of the POSIX getline or BSD fgetln functions (finding the
- * function in a portable way is hard, it's simple enough to write it instead).
- *
- * Note: Unlike the standard getline calls, this function doesn't include the
- * trailing newline character in the returned buffer and discards empty lines
- * (so the caller's EOF marker is a returned line length of 0).
- */
-int
-__wt_getline(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_FH *fh)
-{
- int c;
-
- /*
- * We always NUL-terminate the returned string (even if it's empty),
- * make sure there's buffer space for a trailing NUL in all cases.
- */
- WT_RET(__wt_buf_init(session, buf, 100));
-
- for (;;) {
- WT_RET(fh->fh_getc(session, fh, &c));
- if (c == EOF)
- break;
-
- /* Leave space for a trailing NUL. */
- WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
- if (c == '\n') {
- if (buf->size == 0)
- continue;
- break;
- }
- ((char *)buf->mem)[buf->size++] = (char)c;
- }
-
- ((char *)buf->mem)[buf->size] = '\0';
-
- return (0);
-}
diff --git a/src/os_common/os_init.c b/src/os_common/os_init.c
deleted file mode 100644
index 512216c52a5..00000000000
--- a/src/os_common/os_init.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-/*
- * __wt_os_init --
- * Initialize the OS layer.
- */
-int
-__wt_os_init(WT_SESSION_IMPL *session)
-{
- return (F_ISSET(S2C(session), WT_CONN_IN_MEMORY) ?
- __wt_os_inmemory(session) :
-#if defined(_MSC_VER)
- __wt_os_win(session));
-#else
- __wt_os_posix(session));
-#endif
-}
-
-/*
- * __wt_os_cleanup --
- * Clean up the OS layer.
- */
-int
-__wt_os_cleanup(WT_SESSION_IMPL *session)
-{
- return (F_ISSET(S2C(session), WT_CONN_IN_MEMORY) ?
- __wt_os_inmemory_cleanup(session) :
-#if defined(_MSC_VER)
- __wt_os_win_cleanup(session));
-#else
- __wt_os_posix_cleanup(session));
-#endif
-}
diff --git a/src/os_posix/os_dir.c b/src/os_posix/os_dir.c
index 78ae5f8edd4..a23051e5b93 100644
--- a/src/os_posix/os_dir.c
+++ b/src/os_posix/os_dir.c
@@ -15,30 +15,33 @@
* Get a list of files from a directory, POSIX version.
*/
int
-__wt_posix_directory_list(WT_SESSION_IMPL *session, const char *dir,
- const char *prefix, uint32_t flags, char ***dirlist, u_int *countp)
+__wt_posix_directory_list(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *directory,
+ const char *prefix, char ***dirlistp, uint32_t *countp)
{
struct dirent *dp;
DIR *dirp;
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
size_t dirallocsz;
- u_int count, dirsz;
- bool match;
- char **entries, *path;
+ uint32_t count;
+ char **entries;
- *dirlist = NULL;
- *countp = 0;
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
- WT_RET(__wt_filename(session, dir, &path));
+ *dirlistp = NULL;
+ *countp = 0;
dirp = NULL;
dirallocsz = 0;
- dirsz = 0;
entries = NULL;
- WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
+ WT_SYSCALL_RETRY(((dirp = opendir(directory)) == NULL ? 1 : 0), ret);
if (ret != 0)
- WT_ERR_MSG(session, ret, "%s: directory-list: opendir", path);
+ WT_RET_MSG(session, ret,
+ "%s: directory-list: opendir", directory);
for (count = 0; (dp = readdir(dirp)) != NULL;) {
/*
@@ -49,44 +52,50 @@ __wt_posix_directory_list(WT_SESSION_IMPL *session, const char *dir,
continue;
/* The list of files is optionally filtered by a prefix. */
- match = false;
- if (prefix != NULL &&
- ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
- WT_PREFIX_MATCH(dp->d_name, prefix)) ||
- (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
- !WT_PREFIX_MATCH(dp->d_name, prefix))))
- match = true;
- if (prefix == NULL || match) {
- /*
- * We have a file name we want to return.
- */
- count++;
- if (count > dirsz) {
- dirsz += WT_DIR_ENTRY;
- WT_ERR(__wt_realloc_def(
- session, &dirallocsz, dirsz, &entries));
- }
- WT_ERR(__wt_strdup(
- session, dp->d_name, &entries[count-1]));
- }
+ if (prefix != NULL && !WT_PREFIX_MATCH(dp->d_name, prefix))
+ continue;
+
+ WT_ERR(__wt_realloc_def(
+ session, &dirallocsz, count + 1, &entries));
+ WT_ERR(__wt_strdup(session, dp->d_name, &entries[count]));
+ ++count;
}
- if (count > 0)
- *dirlist = entries;
+
+ *dirlistp = entries;
*countp = count;
err: if (dirp != NULL)
(void)closedir(dirp);
- __wt_free(session, path);
if (ret == 0)
return (0);
- if (*dirlist != NULL) {
- for (count = dirsz; count > 0; count--)
- __wt_free(session, entries[count]);
- __wt_free(session, entries);
- }
+ WT_TRET(__wt_posix_directory_list_free(
+ file_system, wt_session, entries, count));
+
WT_RET_MSG(session, ret,
"%s: directory-list, prefix \"%s\"",
- dir, prefix == NULL ? "" : prefix);
+ directory, prefix == NULL ? "" : prefix);
+}
+
+/*
+ * __wt_posix_directory_list_free --
+ * Free count entries of a directory list, then the list; NULL is OK.
+ */
+int
+__wt_posix_directory_list_free(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, char **dirlist, uint32_t count)
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (dirlist != NULL) {
+ while (count > 0)
+ __wt_free(session, dirlist[--count]);
+ __wt_free(session, dirlist);
+ }
+ return (0);
}
diff --git a/src/os_posix/os_dlopen.c b/src/os_posix/os_dlopen.c
index 9a74eb4813d..ad1fcc90150 100644
--- a/src/os_posix/os_dlopen.c
+++ b/src/os_posix/os_dlopen.c
@@ -19,7 +19,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
WT_DLH *dlh;
WT_RET(__wt_calloc_one(session, &dlh));
- WT_ERR(__wt_strdup(session, path, &dlh->name));
+ WT_ERR(__wt_strdup(session, path == NULL ? "local" : path, &dlh->name));
if ((dlh->handle = dlopen(path, RTLD_LAZY)) == NULL)
WT_ERR_MSG(
diff --git a/src/os_posix/os_fallocate.c b/src/os_posix/os_fallocate.c
index 22879d36182..a162dbe01a1 100644
--- a/src/os_posix/os_fallocate.c
+++ b/src/os_posix/os_fallocate.c
@@ -12,47 +12,28 @@
#include <linux/falloc.h>
#include <sys/syscall.h>
#endif
-/*
- * __wt_posix_handle_allocate_configure --
- * Configure POSIX file-extension behavior for a file handle.
- */
-void
-__wt_posix_handle_allocate_configure(WT_SESSION_IMPL *session, WT_FH *fh)
-{
- WT_UNUSED(session);
-
- fh->fallocate_available = WT_FALLOCATE_NOT_AVAILABLE;
- fh->fallocate_requires_locking = false;
-
- /*
- * Check for the availability of some form of fallocate; in all cases,
- * start off requiring locking, we'll relax that requirement once we
- * know which system calls work with the handle's underlying filesystem.
- */
-#if defined(HAVE_FALLOCATE) || defined(HAVE_POSIX_FALLOCATE)
- fh->fallocate_available = WT_FALLOCATE_AVAILABLE;
- fh->fallocate_requires_locking = true;
-#endif
-#if defined(__linux__) && defined(SYS_fallocate)
- fh->fallocate_available = WT_FALLOCATE_AVAILABLE;
- fh->fallocate_requires_locking = true;
-#endif
-}
/*
* __posix_std_fallocate --
* Linux fallocate call.
*/
static int
-__posix_std_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
+__posix_std_fallocate(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, wt_off_t len)
{
#if defined(HAVE_FALLOCATE)
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
- WT_SYSCALL_RETRY(fallocate(fh->fd, 0, offset, len), ret);
+ WT_UNUSED(wt_session);
+
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
+ WT_SYSCALL_RETRY(fallocate(pfh->fd, 0, offset, len), ret);
return (ret);
#else
- WT_UNUSED(fh);
+ WT_UNUSED(file_handle);
+ WT_UNUSED(wt_session);
WT_UNUSED(offset);
WT_UNUSED(len);
return (ENOTSUP);
@@ -64,10 +45,16 @@ __posix_std_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
* Linux fallocate call (system call version).
*/
static int
-__posix_sys_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
+__posix_sys_fallocate(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, wt_off_t len)
{
#if defined(__linux__) && defined(SYS_fallocate)
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+
+ WT_UNUSED(wt_session);
+
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
/*
* Try the system call for fallocate even if the C library wrapper was
@@ -75,10 +62,11 @@ __posix_sys_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
* Linux versions (RHEL 5.5), but not in the version of the C library.
* This allows it to work everywhere the kernel supports it.
*/
- WT_SYSCALL_RETRY(syscall(SYS_fallocate, fh->fd, 0, offset, len), ret);
+ WT_SYSCALL_RETRY(syscall(SYS_fallocate, pfh->fd, 0, offset, len), ret);
return (ret);
#else
- WT_UNUSED(fh);
+ WT_UNUSED(file_handle);
+ WT_UNUSED(wt_session);
WT_UNUSED(offset);
WT_UNUSED(len);
return (ENOTSUP);
@@ -90,15 +78,22 @@ __posix_sys_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
* POSIX fallocate call.
*/
static int
-__posix_posix_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
+__posix_posix_fallocate(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, wt_off_t len)
{
#if defined(HAVE_POSIX_FALLOCATE)
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+
+ WT_UNUSED(wt_session);
- WT_SYSCALL_RETRY(posix_fallocate(fh->fd, offset, len), ret);
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
+ WT_SYSCALL_RETRY(posix_fallocate(pfh->fd, offset, len), ret);
return (ret);
#else
- WT_UNUSED(fh);
+ WT_UNUSED(file_handle);
+ WT_UNUSED(wt_session);
WT_UNUSED(offset);
WT_UNUSED(len);
return (ENOTSUP);
@@ -106,67 +101,52 @@ __posix_posix_fallocate(WT_FH *fh, wt_off_t offset, wt_off_t len)
}
/*
- * __wt_posix_handle_allocate --
+ * __wt_posix_file_fallocate --
* POSIX fallocate.
*/
int
-__wt_posix_handle_allocate(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
+__wt_posix_file_fallocate(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, wt_off_t len)
{
- WT_DECL_RET;
-
- switch (fh->fallocate_available) {
- /*
- * Check for already configured handles and make the configured call.
- */
- case WT_FALLOCATE_POSIX:
- if ((ret = __posix_posix_fallocate(fh, offset, len)) == 0)
- return (0);
- WT_RET_MSG(session, ret, "%s: posix_fallocate", fh->name);
- case WT_FALLOCATE_STD:
- if ((ret = __posix_std_fallocate(fh, offset, len)) == 0)
- return (0);
- WT_RET_MSG(session, ret, "%s: fallocate", fh->name);
- case WT_FALLOCATE_SYS:
- if ((ret = __posix_sys_fallocate(fh, offset, len)) == 0)
- return (0);
- WT_RET_MSG(session, ret, "%s: sys_fallocate", fh->name);
-
/*
- * Figure out what allocation call this system/filesystem supports, if
- * any.
+ * The first fallocate call: figure out what fallocate call this system
+ * supports, if any.
+ *
+ * The function is configured as a locking fallocate call, so we know
+ * we're single-threaded through here. Set the nolock function first,
+ * then publish the NULL replacement to ensure the handle functions are
+ * always correct.
+ *
+ * We've seen Linux systems where posix_fallocate has corrupted
+ * existing file data (even though that is explicitly disallowed
+ * by POSIX). FreeBSD and Solaris support posix_fallocate, and
+ * so far we've seen no problems leaving it unlocked. Check for
+ * fallocate (and the system call version of fallocate) first to
+ * avoid locking on Linux if at all possible.
*/
- case WT_FALLOCATE_AVAILABLE:
- /*
- * We've seen Linux systems where posix_fallocate has corrupted
- * existing file data (even though that is explicitly disallowed
- * by POSIX). FreeBSD and Solaris support posix_fallocate, and
- * so far we've seen no problems leaving it unlocked. Check for
- * fallocate (and the system call version of fallocate) first to
- * avoid locking on Linux if at all possible.
- */
- if ((ret = __posix_std_fallocate(fh, offset, len)) == 0) {
- fh->fallocate_available = WT_FALLOCATE_STD;
- fh->fallocate_requires_locking = false;
- return (0);
- }
- if ((ret = __posix_sys_fallocate(fh, offset, len)) == 0) {
- fh->fallocate_available = WT_FALLOCATE_SYS;
- fh->fallocate_requires_locking = false;
- return (0);
- }
- if ((ret = __posix_posix_fallocate(fh, offset, len)) == 0) {
- fh->fallocate_available = WT_FALLOCATE_POSIX;
-#if !defined(__linux__)
- fh->fallocate_requires_locking = false;
+ if (__posix_std_fallocate(file_handle, wt_session, offset, len) == 0) {
+ file_handle->fallocate_nolock = __posix_std_fallocate;
+ WT_PUBLISH(file_handle->fallocate, NULL);
+ return (0);
+ }
+ if (__posix_sys_fallocate(file_handle, wt_session, offset, len) == 0) {
+ file_handle->fallocate_nolock = __posix_sys_fallocate;
+ WT_PUBLISH(file_handle->fallocate, NULL);
+ return (0);
+ }
+ if (__posix_posix_fallocate(
+ file_handle, wt_session, offset, len) == 0) {
+#if defined(__linux__)
+ file_handle->fallocate = __posix_posix_fallocate;
+ WT_WRITE_BARRIER();
+#else
+ file_handle->fallocate_nolock = __posix_posix_fallocate;
+ WT_PUBLISH(file_handle->fallocate, NULL);
#endif
- return (0);
- }
- /* FALLTHROUGH */
- case WT_FALLOCATE_NOT_AVAILABLE:
- default:
- fh->fallocate_available = WT_FALLOCATE_NOT_AVAILABLE;
- return (ENOTSUP);
+ return (0);
}
- /* NOTREACHED */
+
+ file_handle->fallocate = NULL;
+ WT_WRITE_BARRIER();
+ return (ENOTSUP);
}
diff --git a/src/os_posix/os_fs.c b/src/os_posix/os_fs.c
index 86aa8db8f4f..ab9c82613d6 100644
--- a/src/os_posix/os_fs.c
+++ b/src/os_posix/os_fs.c
@@ -13,30 +13,11 @@
* Underlying support function to flush a file handle.
*/
static int
-__posix_sync(WT_SESSION_IMPL *session,
- int fd, const char *name, const char *func, bool block)
+__posix_sync(
+ WT_SESSION_IMPL *session, int fd, const char *name, const char *func)
{
WT_DECL_RET;
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
-
-#ifdef HAVE_SYNC_FILE_RANGE
- if (!block) {
- WT_SYSCALL_RETRY(sync_file_range(fd,
- (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
- if (ret == 0)
- return (0);
- WT_RET_MSG(session, ret, "%s: %s: sync_file_range", name, func);
- }
-#else
- /*
- * Callers attempting asynchronous flush handle ENOTSUP returns, and
- * won't make further attempts.
- */
- if (!block)
- return (ENOTSUP);
-#endif
-
#if defined(F_FULLFSYNC)
/*
* OS X fsync documentation:
@@ -73,45 +54,29 @@ __posix_sync(WT_SESSION_IMPL *session,
#endif
}
+#ifdef __linux__
/*
* __posix_directory_sync --
* Flush a directory to ensure file creation is durable.
*/
static int
-__posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
+__posix_directory_sync(
+ WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *path)
{
-#ifdef __linux__
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
int fd, tret;
- const char *dir;
- char *copy;
- tret = 0;
- /*
- * POSIX 1003.1 does not require that fsync of a file handle ensures the
- * entry in the directory containing the file has also reached disk (and
- * there are historic Linux filesystems requiring this), do an explicit
- * fsync on a file descriptor for the directory to be sure.
- */
- copy = NULL;
- if (path == NULL || (dir = strrchr(path, '/')) == NULL)
- path = S2C(session)->home;
- else {
- /*
- * Copy the directory name, leaving the trailing slash in place,
- * so a path of "/foo" doesn't result in an empty string.
- */
- WT_RET(__wt_strndup(
- session, path, (size_t)(dir - path) + 1, &copy));
- path = copy;
- }
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
WT_SYSCALL_RETRY((
(fd = open(path, O_RDONLY, 0444)) == -1 ? 1 : 0), ret);
if (ret != 0)
- WT_ERR_MSG(session, ret, "%s: directory-sync: open", path);
+ WT_RET_MSG(session, ret, "%s: directory-sync: open", path);
- ret = __posix_sync(session, fd, path, "directory-sync", true);
+ ret = __posix_sync(session, fd, path, "directory-sync");
WT_SYSCALL_RETRY(close(fd), tret);
if (tret != 0) {
@@ -119,232 +84,182 @@ __posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
if (ret == 0)
ret = tret;
}
-err: __wt_free(session, copy);
return (ret);
-#else
- WT_UNUSED(session);
- WT_UNUSED(path);
- return (0);
-#endif
}
+#endif
/*
- * __posix_file_exist --
+ * __posix_fs_exist --
* Return if the file exists.
*/
static int
-__posix_file_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
+__posix_fs_exist(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, bool *existp)
{
struct stat sb;
WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
- WT_RET(__wt_filename(session, name, &path));
- name = path;
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
WT_SYSCALL_RETRY(stat(name, &sb), ret);
- if (ret == 0)
+ if (ret == 0) {
*existp = true;
- else if (ret == ENOENT) {
+ return (0);
+ }
+ if (ret == ENOENT) {
*existp = false;
- ret = 0;
- } else
- __wt_err(session, ret, "%s: file-exist: stat", name);
-
- __wt_free(session, path);
- return (ret);
+ return (0);
+ }
+ WT_RET_MSG(session, ret, "%s: file-exist: stat", name);
}
/*
- * __posix_file_remove --
+ * __posix_fs_remove --
* Remove a file.
*/
static int
-__posix_file_remove(WT_SESSION_IMPL *session, const char *name)
+__posix_fs_remove(
+ WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
{
WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
-#ifdef HAVE_DIAGNOSTIC
- if (__wt_handle_search(session, name, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-remove: file has open handles", name);
-#endif
+ WT_UNUSED(file_system);
- WT_RET(__wt_filename(session, name, &path));
- name = path;
+ session = (WT_SESSION_IMPL *)wt_session;
WT_SYSCALL_RETRY(remove(name), ret);
- if (ret != 0)
- __wt_err(session, ret, "%s: file-remove: remove", name);
-
- __wt_free(session, path);
- return (ret);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "%s: file-remove: remove", name);
}
/*
- * __posix_file_rename --
+ * __posix_fs_rename --
* Rename a file.
*/
static int
-__posix_file_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+__posix_fs_rename(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *from, const char *to)
{
WT_DECL_RET;
- char *from_path, *to_path;
-
-#ifdef HAVE_DIAGNOSTIC
- if (__wt_handle_search(session, from, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-rename: file has open handles", from);
- if (__wt_handle_search(session, to, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-rename: file has open handles", to);
-#endif
+ WT_SESSION_IMPL *session;
- from_path = to_path = NULL;
- WT_ERR(__wt_filename(session, from, &from_path));
- from = from_path;
- WT_ERR(__wt_filename(session, to, &to_path));
- to = to_path;
+ WT_UNUSED(file_system);
- WT_SYSCALL_RETRY(rename(from, to), ret);
- if (ret != 0)
- __wt_err(session, ret,
- "%s to %s: file-rename: rename", from, to);
+ session = (WT_SESSION_IMPL *)wt_session;
-err: __wt_free(session, from_path);
- __wt_free(session, to_path);
- return (ret);
+ WT_SYSCALL_RETRY(rename(from, to), ret);
+ if (ret == 0)
+ return (0);
+ WT_RET_MSG(session, ret, "%s to %s: file-rename: rename", from, to);
}
/*
- * __posix_file_size --
+ * __posix_fs_size --
* Get the size of a file in bytes, by file name.
*/
static int
-__posix_file_size(
- WT_SESSION_IMPL *session, const char *name, bool silent, wt_off_t *sizep)
+__posix_fs_size(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, wt_off_t *sizep)
{
struct stat sb;
WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
- WT_RET(__wt_filename(session, name, &path));
- name = path;
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
- /*
- * Optionally don't log errors on ENOENT; some callers of this function
- * expect failure in that case and don't want an error message logged.
- */
WT_SYSCALL_RETRY(stat(name, &sb), ret);
- if (ret == 0)
+ if (ret == 0) {
*sizep = sb.st_size;
- else if (ret != ENOENT || !silent)
- __wt_err(session, ret, "%s: file-size: stat", name);
-
- __wt_free(session, path);
-
- return (ret);
+ return (0);
+ }
+ WT_RET_MSG(session, ret, "%s: file-size: stat", name);
}
+#if defined(HAVE_POSIX_FADVISE)
/*
- * __posix_handle_advise --
+ * __posix_file_advise --
* POSIX fadvise.
*/
static int
-__posix_handle_advise(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, wt_off_t len, int advice)
+__posix_file_advise(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
+ wt_off_t offset, wt_off_t len, int advice)
{
-#if defined(HAVE_POSIX_FADVISE)
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
- /*
- * Refuse pre-load when direct I/O is configured for the file, the
- * kernel cache isn't interesting.
- */
- if (advice == POSIX_MADV_WILLNEED && fh->direct_io)
- return (ENOTSUP);
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- WT_SYSCALL_RETRY(posix_fadvise(fh->fd, offset, len, advice), ret);
+ WT_SYSCALL_RETRY(posix_fadvise(pfh->fd, offset, len, advice), ret);
if (ret == 0)
return (0);
/*
* Treat EINVAL as not-supported, some systems don't support some flags.
- * Quietly fail, callers expect not-supported failures.
+ * Quietly fail, callers expect not-supported failures, and reset the
+ * handle method to prevent future calls.
*/
- if (ret == EINVAL)
+ if (ret == EINVAL) {
+ file_handle->fadvise = NULL;
return (ENOTSUP);
+ }
- WT_RET_MSG(session, ret, "%s: handle-advise: posix_fadvise", fh->name);
-#else
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(advice);
+ WT_RET_MSG(session, ret,
+ "%s: handle-advise: posix_fadvise", file_handle->name);
- /* Quietly fail, callers expect not-supported failures. */
- return (ENOTSUP);
-#endif
}
+#endif
/*
- * __posix_handle_close --
- * ANSI C close/fclose.
+ * __posix_file_close --
+ * ANSI C close.
*/
static int
-__posix_handle_close(WT_SESSION_IMPL *session, WT_FH *fh)
+__posix_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
- if (fh->fp == NULL) {
- WT_SYSCALL_RETRY(close(fh->fd), ret);
- if (ret == 0)
- return (0);
- WT_RET_MSG(session, ret, "%s: handle-close: close", fh->name);
- }
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- /* If the stream was opened for writing, flush the file. */
- if (F_ISSET(fh, WT_FH_FLUSH_ON_CLOSE) && fflush(fh->fp) != 0) {
- ret = __wt_errno();
- __wt_err(session, ret, "%s: handle-close: fflush", fh->name);
+ /* Close the file handle. */
+ if (pfh->fd != -1) {
+ WT_SYSCALL_RETRY(close(pfh->fd), ret);
+ if (ret != 0)
+ __wt_err(session, ret,
+ "%s: handle-close: close", file_handle->name);
}
- /* Close the file. */
- if (fclose(fh->fp) != 0) {
- ret = __wt_errno();
- __wt_err(session, ret, "%s: handle-close: fclose", fh->name);
- }
+ __wt_free(session, file_handle->name);
+ __wt_free(session, pfh);
return (ret);
}
/*
- * __posix_handle_getc --
- * ANSI C fgetc.
- */
-static int
-__posix_handle_getc(WT_SESSION_IMPL *session, WT_FH *fh, int *chp)
-{
- if (fh->fp == NULL)
- WT_RET_MSG(session,
- ENOTSUP, "%s: handle-getc: no stream configured", fh->name);
-
- *chp = fgetc(fh->fp);
- if (*chp != EOF || !ferror(fh->fp))
- return (0);
- WT_RET_MSG(session, __wt_errno(), "%s: handle-getc: fgetc", fh->name);
-}
-
-/*
- * __posix_handle_lock --
+ * __posix_file_lock --
* Lock/unlock a file.
*/
static int
-__posix_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
+__posix_file_lock(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock)
{
struct flock fl;
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
/*
* WiredTiger requires this function be able to acquire locks past
@@ -360,44 +275,32 @@ __posix_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
fl.l_type = lock ? F_WRLCK : F_UNLCK;
fl.l_whence = SEEK_SET;
- WT_SYSCALL_RETRY(fcntl(fh->fd, F_SETLK, &fl), ret);
+ WT_SYSCALL_RETRY(fcntl(pfh->fd, F_SETLK, &fl), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: handle-lock: fcntl", fh->name);
+ WT_RET_MSG(session, ret, "%s: handle-lock: fcntl", file_handle->name);
}
/*
- * __posix_handle_printf --
- * ANSI C vfprintf.
- */
-static int
-__posix_handle_printf(
- WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, va_list ap)
-{
- if (fh->fp == NULL)
- WT_RET_MSG(session, ENOTSUP,
- "%s: vfprintf: no stream configured", fh->name);
-
- if (vfprintf(fh->fp, fmt, ap) >= 0)
- return (0);
- WT_RET_MSG(session, EIO, "%s: handle-printf: vfprintf", fh->name);
-}
-
-/*
- * __posix_handle_read --
+ * __posix_file_read --
* POSIX pread.
*/
static int
-__posix_handle_read(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+__posix_file_read(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
{
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
size_t chunk;
ssize_t nr;
uint8_t *addr;
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
/* Assert direct I/O is aligned and a multiple of the alignment. */
WT_ASSERT(session,
- !fh->direct_io ||
+ !pfh->direct_io ||
S2C(session)->buffer_alignment == 0 ||
(!((uintptr_t)buf &
(uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
@@ -407,79 +310,122 @@ __posix_handle_read(
/* Break reads larger than 1GB into 1GB chunks. */
for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
chunk = WT_MIN(len, WT_GIGABYTE);
- if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0)
+ if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0)
WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(),
"%s: handle-read: pread: failed to read %"
WT_SIZET_FMT " bytes at offset %" PRIuMAX,
- fh->name, chunk, (uintmax_t)offset);
+ file_handle->name, chunk, (uintmax_t)offset);
}
return (0);
}
/*
- * __posix_handle_size --
+ * __posix_file_size --
* Get the size of a file in bytes, by file handle.
*/
static int
-__posix_handle_size(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+__posix_file_size(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t *sizep)
{
struct stat sb;
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
- WT_SYSCALL_RETRY(fstat(fh->fd, &sb), ret);
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
+ WT_SYSCALL_RETRY(fstat(pfh->fd, &sb), ret);
if (ret == 0) {
*sizep = sb.st_size;
return (0);
}
- WT_RET_MSG(session, ret, "%s: handle-size: fstat", fh->name);
+ WT_RET_MSG(session, ret, "%s: handle-size: fstat", file_handle->name);
+}
+
+/*
+ * __posix_file_sync --
+ * Flush the handle's file descriptor by calling __posix_sync.
+ */
+static int
+__posix_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
+{
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
+ return (
+ __posix_sync(session, pfh->fd, file_handle->name, "handle-sync"));
}
+#ifdef HAVE_SYNC_FILE_RANGE
/*
- * __posix_handle_sync --
- * POSIX fflush/fsync.
+ * __posix_file_sync_nowait --
+ * Non-blocking flush using sync_file_range.
*/
static int
-__posix_handle_sync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
+__posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
- if (fh->fp == NULL)
- return (__posix_sync(
- session, fh->fd, fh->name, "handle-sync", block));
+ WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- if (fflush(fh->fp) == 0)
+ WT_SYSCALL_RETRY(sync_file_range(pfh->fd,
+ (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
+ if (ret == 0)
return (0);
- WT_RET_MSG(session, __wt_errno(), "%s: handle-sync: fflush", fh->name);
+ WT_RET_MSG(session, ret,
+ "%s: handle-sync-nowait: sync_file_range", file_handle->name);
}
+#endif
/*
- * __posix_handle_truncate --
+ * __posix_file_truncate --
* POSIX ftruncate.
*/
static int
-__posix_handle_truncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+__posix_file_truncate(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t len)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
- WT_SYSCALL_RETRY(ftruncate(fh->fd, len), ret);
+ WT_SYSCALL_RETRY(ftruncate(pfh->fd, len), ret);
if (ret == 0)
return (0);
- WT_RET_MSG(session, ret, "%s: handle-truncate: ftruncate", fh->name);
+ WT_RET_MSG(session, ret,
+ "%s: handle-truncate: ftruncate", file_handle->name);
}
/*
- * __posix_handle_write --
+ * __posix_file_write --
* POSIX pwrite.
*/
static int
-__posix_handle_write(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+__posix_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
+ wt_off_t offset, size_t len, const void *buf)
{
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
size_t chunk;
ssize_t nw;
const uint8_t *addr;
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
+
/* Assert direct I/O is aligned and a multiple of the alignment. */
WT_ASSERT(session,
- !fh->direct_io ||
+ !pfh->direct_io ||
S2C(session)->buffer_alignment == 0 ||
(!((uintptr_t)buf &
(uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
@@ -489,21 +435,21 @@ __posix_handle_write(WT_SESSION_IMPL *session,
/* Break writes larger than 1GB into 1GB chunks. */
for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
chunk = WT_MIN(len, WT_GIGABYTE);
- if ((nw = pwrite(fh->fd, addr, chunk, offset)) < 0)
+ if ((nw = pwrite(pfh->fd, addr, chunk, offset)) < 0)
WT_RET_MSG(session, __wt_errno(),
"%s: handle-write: pwrite: failed to write %"
WT_SIZET_FMT " bytes at offset %" PRIuMAX,
- fh->name, chunk, (uintmax_t)offset);
+ file_handle->name, chunk, (uintmax_t)offset);
}
return (0);
}
/*
- * __posix_handle_open_cloexec --
+ * __posix_open_file_cloexec --
* Prevent child access to file handles.
*/
static inline int
-__posix_handle_open_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
+__posix_open_file_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
{
#if defined(HAVE_FCNTL) && defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
int f;
@@ -528,28 +474,35 @@ __posix_handle_open_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
}
/*
- * __posix_handle_open --
+ * __posix_open_file --
* Open a file handle.
*/
static int
-__posix_handle_open(WT_SESSION_IMPL *session,
- WT_FH *fh, const char *name, uint32_t file_type, uint32_t flags)
+__posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
+ const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ WT_FILE_HANDLE **file_handlep)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_FILE_HANDLE *file_handle;
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
mode_t mode;
- int f, fd, tret;
- bool direct_io;
- const char *stream_mode;
+ int f;
+
+ WT_UNUSED(file_system);
+ *file_handlep = NULL;
+
+ session = (WT_SESSION_IMPL *)wt_session;
conn = S2C(session);
- direct_io = false;
+
+ WT_RET(__wt_calloc_one(session, &pfh));
/* Set up error handling. */
- fh->fd = fd = -1;
- fh->fp = NULL;
+ pfh->fd = -1;
- if (file_type == WT_FILE_TYPE_DIRECTORY) {
+ if (file_type == WT_OPEN_FILE_TYPE_DIRECTORY) {
f = O_RDONLY;
#ifdef O_CLOEXEC
/*
@@ -560,10 +513,10 @@ __posix_handle_open(WT_SESSION_IMPL *session,
f |= O_CLOEXEC;
#endif
WT_SYSCALL_RETRY((
- (fd = open(name, f, 0444)) == -1 ? 1 : 0), ret);
+ (pfh->fd = open(name, f, 0444)) == -1 ? 1 : 0), ret);
if (ret != 0)
WT_ERR_MSG(session, ret, "%s: handle-open: open", name);
- WT_ERR(__posix_handle_open_cloexec(session, fd, name));
+ WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
goto directory_open;
}
@@ -589,28 +542,20 @@ __posix_handle_open(WT_SESSION_IMPL *session,
f |= O_CLOEXEC;
#endif
#ifdef O_DIRECT
- /*
- * Direct I/O: file-type is a flag from the set of possible flags stored
- * in the connection handle during configuration, check for a match.
- * Also, "direct_io=checkpoint" configures direct I/O for readonly data
- * files.
- */
- if (FLD_ISSET(conn->direct_io, file_type) ||
- (LF_ISSET(WT_OPEN_READONLY) &&
- file_type == WT_FILE_TYPE_DATA &&
- FLD_ISSET(conn->direct_io, WT_FILE_TYPE_CHECKPOINT))) {
+ /* Direct I/O. */
+ if (LF_ISSET(WT_OPEN_DIRECTIO)) {
f |= O_DIRECT;
- direct_io = true;
- }
+ pfh->direct_io = true;
+ } else
+ pfh->direct_io = false;
#endif
- fh->direct_io = direct_io;
#ifdef O_NOATIME
/* Avoid updating metadata for read-only workloads. */
- if (file_type == WT_FILE_TYPE_DATA)
+ if (file_type == WT_OPEN_FILE_TYPE_DATA)
f |= O_NOATIME;
#endif
- if (file_type == WT_FILE_TYPE_LOG &&
+ if (file_type == WT_OPEN_FILE_TYPE_LOG &&
FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
#ifdef O_DSYNC
f |= O_DSYNC;
@@ -622,115 +567,122 @@ __posix_handle_open(WT_SESSION_IMPL *session,
#endif
}
- WT_SYSCALL_RETRY(((fd = open(name, f, mode)) == -1 ? 1 : 0), ret);
+ WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? 1 : 0), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
- direct_io ?
+ pfh->direct_io ?
"%s: handle-open: open: failed with direct I/O configured, "
"some filesystem types do not support direct I/O" :
"%s: handle-open: open", name);
- WT_ERR(__posix_handle_open_cloexec(session, fd, name));
+ WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
- /* Disable read-ahead on trees: it slows down random read workloads. */
#if defined(HAVE_POSIX_FADVISE)
- if (file_type == WT_FILE_TYPE_DATA) {
+ /*
+ * Disable read-ahead on trees: it slows down random read workloads.
+ * Ignore fadvise when doing direct I/O, the kernel cache isn't
+ * interesting.
+ */
+ if (!pfh->direct_io && file_type == WT_OPEN_FILE_TYPE_DATA) {
WT_SYSCALL_RETRY(
- posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM), ret);
+ posix_fadvise(pfh->fd, 0, 0, POSIX_FADV_RANDOM), ret);
if (ret != 0)
WT_ERR_MSG(session, ret,
"%s: handle-open: posix_fadvise", name);
}
#endif
- /* Optionally configure a stdio stream API. */
- switch (LF_MASK(WT_STREAM_APPEND | WT_STREAM_READ | WT_STREAM_WRITE)) {
- case WT_STREAM_APPEND:
- stream_mode = "a";
- F_SET(fh, WT_FH_FLUSH_ON_CLOSE);
- break;
- case WT_STREAM_READ:
- stream_mode = "r";
- break;
- case WT_STREAM_WRITE:
- stream_mode = "w";
- F_SET(fh, WT_FH_FLUSH_ON_CLOSE);
- break;
- case 0:
- default:
- stream_mode = NULL;
- break;
- }
- if (stream_mode != NULL) {
- if ((fh->fp = fdopen(fd, stream_mode)) == NULL)
- WT_ERR_MSG(session, __wt_errno(),
- "%s: handle-open: fdopen", name);
- if (LF_ISSET(WT_STREAM_LINE_BUFFER))
- __wt_stream_set_line_buffer(fh->fp);
- }
-
directory_open:
- fh->fd = fd;
-
- /* Configure fallocate calls. */
- __wt_posix_handle_allocate_configure(session, fh);
-
- fh->fh_advise = __posix_handle_advise;
- fh->fh_allocate = __wt_posix_handle_allocate;
- fh->fh_close = __posix_handle_close;
- fh->fh_getc = __posix_handle_getc;
- fh->fh_lock = __posix_handle_lock;
- fh->fh_map = __wt_posix_map;
- fh->fh_map_discard = __wt_posix_map_discard;
- fh->fh_map_preload = __wt_posix_map_preload;
- fh->fh_map_unmap = __wt_posix_map_unmap;
- fh->fh_printf = __posix_handle_printf;
- fh->fh_read = __posix_handle_read;
- fh->fh_size = __posix_handle_size;
- fh->fh_sync = __posix_handle_sync;
- fh->fh_truncate = __posix_handle_truncate;
- fh->fh_write = __posix_handle_write;
+ /* Initialize public information. */
+ file_handle = (WT_FILE_HANDLE *)pfh;
+ WT_ERR(__wt_strdup(session, name, &file_handle->name));
+
+ file_handle->close = __posix_file_close;
+#if defined(HAVE_POSIX_FADVISE)
+ /*
+ * Ignore fadvise when doing direct I/O, the kernel cache isn't
+ * interesting.
+ */
+ if (!pfh->direct_io)
+ file_handle->fadvise = __posix_file_advise;
+#endif
+ file_handle->fallocate = __wt_posix_file_fallocate;
+ file_handle->lock = __posix_file_lock;
+#ifdef WORDS_BIGENDIAN
+ /*
+ * The underlying objects are little-endian; mapping objects isn't
+ * currently supported on big-endian systems.
+ */
+#else
+ file_handle->map = __wt_posix_map;
+#ifdef HAVE_POSIX_MADVISE
+ file_handle->map_discard = __wt_posix_map_discard;
+ file_handle->map_preload = __wt_posix_map_preload;
+#endif
+ file_handle->unmap = __wt_posix_unmap;
+#endif
+ file_handle->read = __posix_file_read;
+ file_handle->size = __posix_file_size;
+ file_handle->sync = __posix_file_sync;
+#ifdef HAVE_SYNC_FILE_RANGE
+ file_handle->sync_nowait = __posix_file_sync_nowait;
+#endif
+ file_handle->truncate = __posix_file_truncate;
+ file_handle->write = __posix_file_write;
+
+ *file_handlep = file_handle;
return (0);
-err: if (fd != -1) {
- WT_SYSCALL_RETRY(close(fd), tret);
- if (tret != 0)
- __wt_err(session, tret, "%s: handle-open: close", name);
- }
+err: WT_TRET(__posix_file_close((WT_FILE_HANDLE *)pfh, wt_session));
return (ret);
}
/*
- * __wt_os_posix --
- * Initialize a POSIX configuration.
+ * __posix_terminate --
+ * Terminate a POSIX configuration.
*/
-int
-__wt_os_posix(WT_SESSION_IMPL *session)
+static int
+__posix_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session)
{
- WT_CONNECTION_IMPL *conn;
+ WT_SESSION_IMPL *session;
- conn = S2C(session);
+ WT_UNUSED(file_system);
- /* Initialize the POSIX jump table. */
- conn->file_directory_list = __wt_posix_directory_list;
- conn->file_directory_sync = __posix_directory_sync;
- conn->file_exist = __posix_file_exist;
- conn->file_remove = __posix_file_remove;
- conn->file_rename = __posix_file_rename;
- conn->file_size = __posix_file_size;
- conn->handle_open = __posix_handle_open;
+ session = (WT_SESSION_IMPL *)wt_session;
+ __wt_free(session, file_system);
return (0);
}
/*
- * __wt_os_posix_cleanup --
- * Discard a POSIX configuration.
+ * __wt_os_posix --
+ * Initialize a POSIX configuration.
*/
int
-__wt_os_posix_cleanup(WT_SESSION_IMPL *session)
+__wt_os_posix(WT_SESSION_IMPL *session)
{
- WT_UNUSED(session);
+ WT_CONNECTION_IMPL *conn;
+ WT_FILE_SYSTEM *file_system;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_calloc_one(session, &file_system));
+
+ /* Initialize the POSIX jump table. */
+ file_system->directory_list = __wt_posix_directory_list;
+ file_system->directory_list_free = __wt_posix_directory_list_free;
+#ifdef __linux__
+ file_system->directory_sync = __posix_directory_sync;
+#endif
+ file_system->exist = __posix_fs_exist;
+ file_system->open_file = __posix_open_file;
+ file_system->remove = __posix_fs_remove;
+ file_system->rename = __posix_fs_rename;
+ file_system->size = __posix_fs_size;
+ file_system->terminate = __posix_terminate;
+
+ /* Switch it into place. */
+ conn->file_system = file_system;
return (0);
}
diff --git a/src/os_posix/os_map.c b/src/os_posix/os_map.c
index de28891ffd1..7fde4037250 100644
--- a/src/os_posix/os_map.c
+++ b/src/os_posix/os_map.c
@@ -13,23 +13,26 @@
* Map a file into memory.
*/
int
-__wt_posix_map(WT_SESSION_IMPL *session,
- WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie)
+__wt_posix_map(WT_FILE_HANDLE *fh, WT_SESSION *wt_session,
+ void *mapped_regionp, size_t *lenp, void *mapped_cookiep)
{
+ WT_FILE_HANDLE_POSIX *pfh;
+ WT_SESSION_IMPL *session;
size_t len;
wt_off_t file_size;
void *map;
- WT_UNUSED(mappingcookie);
+ WT_UNUSED(mapped_cookiep);
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
+ session = (WT_SESSION_IMPL *)wt_session;
+ pfh = (WT_FILE_HANDLE_POSIX *)fh;
/*
* Mapping isn't possible if direct I/O configured for the file, the
* Linux open(2) documentation says applications should avoid mixing
* mmap(2) of files with direct I/O to the same files.
*/
- if (fh->direct_io)
+ if (pfh->direct_io)
return (ENOTSUP);
/*
@@ -37,7 +40,7 @@ __wt_posix_map(WT_SESSION_IMPL *session,
* underneath us, our caller needs to ensure consistency of the mapped
* region vs. any other file activity.
*/
- WT_RET(__wt_filesize(session, fh, &file_size));
+ WT_RET(fh->size(fh, wt_session, &file_size));
len = (size_t)file_size;
(void)__wt_verbose(session, WT_VERB_HANDLEOPS,
@@ -49,43 +52,48 @@ __wt_posix_map(WT_SESSION_IMPL *session,
MAP_NOCORE |
#endif
MAP_PRIVATE,
- fh->fd, (wt_off_t)0)) == MAP_FAILED)
+ pfh->fd, (wt_off_t)0)) == MAP_FAILED)
WT_RET_MSG(session,
__wt_errno(), "%s: memory-map: mmap", fh->name);
- *(void **)mapp = map;
+ *(void **)mapped_regionp = map;
*lenp = len;
return (0);
}
#ifdef HAVE_POSIX_MADVISE
/*
- * __posix_map_preload_madvise --
+ * __wt_posix_map_preload --
* Cause a section of a memory map to be faulted in.
*/
-static int
-__posix_map_preload_madvise(
- WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t size)
+int
+__wt_posix_map_preload(WT_FILE_HANDLE *fh,
+ WT_SESSION *wt_session, const void *map, size_t length, void *mapped_cookie)
{
WT_BM *bm;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
void *blk;
+ WT_UNUSED(mapped_cookie);
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
conn = S2C(session);
bm = S2BT(session)->bm;
/* Linux requires the address be aligned to a 4KB boundary. */
- blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
- size += WT_PTRDIFF(p, blk);
+ blk = (void *)((uintptr_t)map & ~(uintptr_t)(conn->page_size - 1));
+ length += WT_PTRDIFF(map, blk);
/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
/* Read in 2MB blocks every 1MB of data. */
- if (((uintptr_t)((uint8_t *)blk + size) &
+ if (((uintptr_t)((uint8_t *)blk + length) &
(uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
return (0);
- size = WT_MIN(WT_MAX(20 * size, 2 << 20),
+ length = WT_MIN(WT_MAX(20 * length, 2 << 20),
WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
}
@@ -93,10 +101,10 @@ __posix_map_preload_madvise(
* Manual pages aren't clear on whether alignment is required for the
* size, so we will be conservative.
*/
- size &= ~(size_t)(conn->page_size - 1);
+ length &= ~(size_t)(conn->page_size - 1);
- if (size <= (size_t)conn->page_size ||
- (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) == 0)
+ if (length <= (size_t)conn->page_size ||
+ (ret = posix_madvise(blk, length, POSIX_MADV_WILLNEED)) == 0)
return (0);
WT_RET_MSG(session, ret,
"%s: memory-map preload: posix_madvise: POSIX_MADV_WILLNEED",
@@ -104,46 +112,30 @@ __posix_map_preload_madvise(
}
#endif
-/*
- * __wt_posix_map_preload --
- * Cause a section of a memory map to be faulted in.
- */
-int
-__wt_posix_map_preload(
- WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t size)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
-
-#ifdef HAVE_POSIX_MADVISE
- return (__posix_map_preload_madvise(session, fh, p, size));
-#else
- WT_UNUSED(fh);
- WT_UNUSED(p);
- WT_UNUSED(size);
- return (ENOTSUP);
-#endif
-}
-
#ifdef HAVE_POSIX_MADVISE
/*
- * __posix_map_discard_madvise --
+ * __wt_posix_map_discard --
* Discard a chunk of the memory map.
*/
-static int
-__posix_map_discard_madvise(
- WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t size)
+int
+__wt_posix_map_discard(WT_FILE_HANDLE *fh,
+ WT_SESSION *wt_session, void *map, size_t length, void *mapped_cookie)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
void *blk;
+ WT_UNUSED(mapped_cookie);
+
+ session = (WT_SESSION_IMPL *)wt_session;
conn = S2C(session);
/* Linux requires the address be aligned to a 4KB boundary. */
- blk = (void *)((uintptr_t)p & ~(uintptr_t)(conn->page_size - 1));
- size += WT_PTRDIFF(p, blk);
+ blk = (void *)((uintptr_t)map & ~(uintptr_t)(conn->page_size - 1));
+ length += WT_PTRDIFF(map, blk);
- if ((ret = posix_madvise(blk, size, POSIX_MADV_DONTNEED)) == 0)
+ if ((ret = posix_madvise(blk, length, POSIX_MADV_DONTNEED)) == 0)
return (0);
WT_RET_MSG(session, ret,
"%s: memory-map discard: posix_madvise: POSIX_MADV_DONTNEED",
@@ -152,41 +144,23 @@ __posix_map_discard_madvise(
#endif
/*
- * __wt_posix_map_discard --
- * Discard a chunk of the memory map.
- */
-int
-__wt_posix_map_discard(
- WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t size)
-{
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
-
-#ifdef HAVE_POSIX_MADVISE
- return (__posix_map_discard_madvise(session, fh, p, size));
-#else
- WT_UNUSED(fh);
- WT_UNUSED(p);
- WT_UNUSED(size);
- return (ENOTSUP);
-#endif
-}
-
-/*
- * __wt_posix_map_unmap --
+ * __wt_posix_unmap --
* Remove a memory mapping.
*/
int
-__wt_posix_map_unmap(WT_SESSION_IMPL *session,
- WT_FH *fh, void *map, size_t len, void **mappingcookie)
+__wt_posix_unmap(WT_FILE_HANDLE *fh, WT_SESSION *wt_session,
+ void *mapped_region, size_t len, void *mapped_cookie)
{
- WT_UNUSED(mappingcookie);
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(mapped_cookie);
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_IN_MEMORY));
+ session = (WT_SESSION_IMPL *)wt_session;
(void)__wt_verbose(session, WT_VERB_HANDLEOPS,
"%s: memory-unmap: %" WT_SIZET_FMT " bytes", fh->name, len);
- if (munmap(map, len) == 0)
+ if (munmap(mapped_region, len) == 0)
return (0);
WT_RET_MSG(session, __wt_errno(), "%s: memory-unmap: munmap", fh->name);
diff --git a/src/os_win/os_dir.c b/src/os_win/os_dir.c
index 64eae60983c..6f796f6ef7d 100644
--- a/src/os_win/os_dir.c
+++ b/src/os_win/os_dir.c
@@ -13,34 +13,37 @@
* Get a list of files from a directory, MSVC version.
*/
int
-__wt_win_directory_list(WT_SESSION_IMPL *session, const char *dir,
- const char *prefix, uint32_t flags, char ***dirlist, u_int *countp)
+__wt_win_directory_list(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *directory,
+ const char *prefix, char ***dirlistp, uint32_t *countp)
{
HANDLE findhandle;
WIN32_FIND_DATA finddata;
WT_DECL_ITEM(pathbuf);
WT_DECL_RET;
+ WT_SESSION_IMPL *session;
size_t dirallocsz, pathlen;
- u_int count, dirsz;
- bool match;
- char **entries, *path;
+ uint32_t count;
+ char *dir_copy, **entries;
- *dirlist = NULL;
- *countp = 0;
+ WT_UNUSED(file_system);
- WT_RET(__wt_filename(session, dir, &path));
+ session = (WT_SESSION_IMPL *)wt_session;
- pathlen = strlen(path);
- if (path[pathlen - 1] == '\\')
- path[pathlen - 1] = '\0';
- WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
- WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", path));
+ *dirlistp = NULL;
+ *countp = 0;
findhandle = INVALID_HANDLE_VALUE;
dirallocsz = 0;
- dirsz = 0;
entries = NULL;
+ WT_ERR(__wt_strdup(session, directory, &dir_copy));
+ pathlen = strlen(dir_copy);
+ if (dir_copy[pathlen - 1] == '\\')
+ dir_copy[pathlen - 1] = '\0';
+ WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
+ WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", dir_copy));
+
findhandle = FindFirstFileA(pathbuf->data, &finddata);
if (findhandle == INVALID_HANDLE_VALUE)
WT_ERR_MSG(session, __wt_getlasterror(),
@@ -56,46 +59,54 @@ __wt_win_directory_list(WT_SESSION_IMPL *session, const char *dir,
continue;
/* The list of files is optionally filtered by a prefix. */
- match = false;
if (prefix != NULL &&
- ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
- WT_PREFIX_MATCH(finddata.cFileName, prefix)) ||
- (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
- !WT_PREFIX_MATCH(finddata.cFileName, prefix))))
- match = true;
- if (prefix == NULL || match) {
- /*
- * We have a file name we want to return.
- */
- count++;
- if (count > dirsz) {
- dirsz += WT_DIR_ENTRY;
- WT_ERR(__wt_realloc_def(session,
- &dirallocsz, dirsz, &entries));
- }
- WT_ERR(__wt_strdup(session,
- finddata.cFileName, &entries[count - 1]));
- }
+ !WT_PREFIX_MATCH(finddata.cFileName, prefix))
+ continue;
+
+ WT_ERR(__wt_realloc_def(
+ session, &dirallocsz, count + 1, &entries));
+ WT_ERR(__wt_strdup(
+ session, finddata.cFileName, &entries[count]));
+ ++count;
} while (FindNextFileA(findhandle, &finddata) != 0);
- if (count > 0)
- *dirlist = entries;
+
+ *dirlistp = entries;
*countp = count;
err: if (findhandle != INVALID_HANDLE_VALUE)
(void)FindClose(findhandle);
- __wt_free(session, path);
+ __wt_free(session, dir_copy);
__wt_scr_free(session, &pathbuf);
if (ret == 0)
return (0);
- if (*dirlist != NULL) {
- for (count = dirsz; count > 0; count--)
- __wt_free(session, entries[count]);
- __wt_free(session, entries);
- }
+ WT_TRET(__wt_win_directory_list_free(
+ file_system, wt_session, entries, count));
WT_RET_MSG(session, ret,
"%s: directory-list, prefix \"%s\"",
- dir, prefix == NULL ? "" : prefix);
+ directory, prefix == NULL ? "" : prefix);
+}
+
+/*
+ * __wt_win_directory_list_free --
+ * Free memory returned by __wt_win_directory_list, Windows version.
+ */
+int
+__wt_win_directory_list_free(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, char **dirlist, uint32_t count)
+{
+ WT_SESSION_IMPL *session;
+
+ WT_UNUSED(file_system);
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (dirlist != NULL) {
+ while (count > 0)
+ __wt_free(session, dirlist[--count]);
+ __wt_free(session, dirlist);
+ }
+ return (0);
}
diff --git a/src/os_win/os_dlopen.c b/src/os_win/os_dlopen.c
index ce949e4ea5f..9289c8f6488 100644
--- a/src/os_win/os_dlopen.c
+++ b/src/os_win/os_dlopen.c
@@ -20,6 +20,7 @@ __wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
WT_RET(__wt_calloc_one(session, &dlh));
WT_ERR(__wt_strdup(session, path, &dlh->name));
+ WT_ERR(__wt_strdup(session, path == NULL ? "local" : path, &dlh->name));
/* NULL means load from the current binary */
if (path == NULL) {
diff --git a/src/os_win/os_fs.c b/src/os_win/os_fs.c
index 95c0ea40ce6..33e281bf8ae 100644
--- a/src/os_win/os_fs.c
+++ b/src/os_win/os_fs.c
@@ -9,34 +9,21 @@
#include "wt_internal.h"
/*
- * __win_directory_sync --
- * Flush a directory to ensure a file creation is durable.
- */
-static int
-__win_directory_sync(WT_SESSION_IMPL *session, const char *path)
-{
- WT_UNUSED(session);
- WT_UNUSED(path);
- return (0);
-}
-
-/*
- * __win_file_exist --
+ * __win_fs_exist --
* Return if the file exists.
*/
static int
-__win_file_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
+__win_fs_exist(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, bool *existp)
{
WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
- WT_RET(__wt_filename(session, name, &path));
+ WT_UNUSED(file_system);
- ret = GetFileAttributesA(path);
+ session = (WT_SESSION_IMPL *)wt_session;
- __wt_free(session, path);
-
- if (ret != INVALID_FILE_ATTRIBUTES)
+ if (GetFileAttributesA(name) != INVALID_FILE_ATTRIBUTES)
*existp = true;
else
*existp = false;
@@ -45,242 +32,138 @@ __win_file_exist(WT_SESSION_IMPL *session, const char *name, bool *existp)
}
/*
- * __win_file_remove --
+ * __win_fs_remove --
* Remove a file.
*/
static int
-__win_file_remove(WT_SESSION_IMPL *session, const char *name)
+__win_fs_remove(
+ WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session, const char *name)
{
WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
-#ifdef HAVE_DIAGNOSTIC
- if (__wt_handle_search(session, name, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-remove: file has open handles", name);
-#endif
+ WT_UNUSED(file_system);
- WT_RET(__wt_filename(session, name, &path));
- name = path;
+ session = (WT_SESSION_IMPL *)wt_session;
- if (DeleteFileA(name) == FALSE) {
- ret = __wt_getlasterror();
- __wt_err(session, ret, "%s: file-remove: DeleteFileA", name);
- }
+ if (DeleteFileA(name) == FALSE)
+ WT_RET_MSG(session, __wt_getlasterror(),
+ "%s: file-remove: DeleteFileA", name);
- __wt_free(session, path);
- return (ret);
+ return (0);
}
/*
- * __win_file_rename --
+ * __win_fs_rename --
* Rename a file.
*/
static int
-__win_file_rename(WT_SESSION_IMPL *session, const char *from, const char *to)
+__win_fs_rename(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *from, const char *to)
{
WT_DECL_RET;
- char *from_path, *to_path;
+ WT_SESSION_IMPL *session;
-#ifdef HAVE_DIAGNOSTIC
- if (__wt_handle_search(session, from, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-rename: file has open handles", from);
- if (__wt_handle_search(session, to, false, NULL, NULL))
- WT_RET_MSG(session, EINVAL,
- "%s: file-rename: file has open handles", to);
-#endif
+ WT_UNUSED(file_system);
- from_path = to_path = NULL;
- WT_ERR(__wt_filename(session, from, &from_path));
- from = from_path;
- WT_ERR(__wt_filename(session, to, &to_path));
- to = to_path;
+ session = (WT_SESSION_IMPL *)wt_session;
/*
* Check if file exists since Windows does not override the file if
* it exists.
*/
if (GetFileAttributesA(to) != INVALID_FILE_ATTRIBUTES)
- if (DeleteFileA(to) == FALSE) {
- ret = __wt_getlasterror();
- __wt_err(session, ret,
+ if (DeleteFileA(to) == FALSE)
+ WT_RET_MSG(session, __wt_getlasterror(),
"%s to %s: file-rename: rename", from, to);
- }
- if (ret == 0 && MoveFileA(from, to) == FALSE) {
- ret = __wt_getlasterror();
- __wt_err(session, ret,
+ if (MoveFileA(from, to) == FALSE)
+ WT_RET_MSG(session, __wt_getlasterror(),
"%s to %s: file-rename: rename", from, to);
- }
-err: __wt_free(session, from_path);
- __wt_free(session, to_path);
- return (ret);
+ return (0);
}
/*
- * __win_file_size --
+ * __wt_win_fs_size --
* Get the size of a file in bytes, by file name.
*/
-static int
-__win_file_size(
- WT_SESSION_IMPL *session, const char *name, bool silent, wt_off_t *sizep)
+int
+__wt_win_fs_size(WT_FILE_SYSTEM *file_system,
+ WT_SESSION *wt_session, const char *name, wt_off_t *sizep)
{
WIN32_FILE_ATTRIBUTE_DATA data;
- WT_DECL_RET;
- char *path;
+ WT_SESSION_IMPL *session;
- WT_RET(__wt_filename(session, name, &path));
+ WT_UNUSED(file_system);
- ret = GetFileAttributesExA(path, GetFileExInfoStandard, &data);
+ session = (WT_SESSION_IMPL *)wt_session;
- __wt_free(session, path);
-
- if (ret != 0) {
+ if (GetFileAttributesExA(name, GetFileExInfoStandard, &data) != 0) {
*sizep =
((int64_t)data.nFileSizeHigh << 32) | data.nFileSizeLow;
return (0);
}
- /*
- * Some callers of this function expect failure if the file doesn't
- * exist, and don't want an error message logged.
- */
- ret = __wt_getlasterror();
- if (!silent)
- WT_RET_MSG(session, ret,
- "%s: file-size: GetFileAttributesEx", name);
- return (ret);
+ WT_RET_MSG(session, __wt_getlasterror(),
+ "%s: file-size: GetFileAttributesEx", name);
}
/*
- * __win_handle_advise --
- * MSVC fadvise.
+ * __win_file_close --
+ * Close a Windows file handle.
*/
static int
-__win_handle_advise(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, wt_off_t len, int advice)
+__win_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(offset);
- WT_UNUSED(len);
- WT_UNUSED(advice);
-
- /* Quietly fail, callers expect not-supported failures. */
- return (ENOTSUP);
-}
+ WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
-/*
- * __win_handle_allocate_configure --
- * Configure fallocate behavior for a file handle.
- */
-static void
-__win_handle_allocate_configure(WT_SESSION_IMPL *session, WT_FH *fh)
-{
- WT_UNUSED(session);
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
/*
- * fallocate on Windows would be implemented using SetEndOfFile, which
- * can also truncate the file. WiredTiger expects fallocate to ignore
- * requests to truncate the file which Windows does not do, so we don't
- * support the call.
+ * Close the primary and secondary handles.
+ *
+ * We don't open Windows system handles when opening directories for
+ * flushing, as it's not necessary (or possible) to flush a directory
+ * on Windows. Confirm the file handle is open before closing it.
*/
- fh->fallocate_available = WT_FALLOCATE_NOT_AVAILABLE;
- fh->fallocate_requires_locking = false;
-}
-
-/*
- * __win_handle_allocate --
- * Allocate space for a file handle.
- */
-static int
-__win_handle_allocate(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, wt_off_t len)
-{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(offset);
- WT_UNUSED(len);
-
- WT_RET_MSG(session, ENOTSUP, "%s: handle-allocate", fh->name);
- return (ENOTSUP);
-}
-
-/*
- * __win_handle_close --
- * Close a file handle.
- */
-static int
-__win_handle_close(WT_SESSION_IMPL *session, WT_FH *fh)
-{
- WT_DECL_RET;
-
- if (fh->filehandle != INVALID_HANDLE_VALUE) {
- /*
- * We don't open Windows system handles when opening directories
- * for flushing, as it is not necessary (or possible) to flush
- * a directory on Windows. Confirm the file handle is set before
- * attempting to close it.
- */
- if (CloseHandle(fh->filehandle) == 0) {
- ret = __wt_getlasterror();
- __wt_err(session, ret,
- "%s: handle-close: CloseHandle", fh->name);
- }
- }
- if (fh->fp != NULL) {
- /* If the stream was opened for writing, flush the file. */
- if (F_ISSET(fh, WT_FH_FLUSH_ON_CLOSE) && fflush(fh->fp) != 0) {
- ret = __wt_errno();
- __wt_err(session,
- ret, "%s: handle-close: fflush", fh->name);
- }
-
- /* Close the file, closing all the underlying handles. */
- if (fclose(fh->fp) != 0) {
- ret = __wt_errno();
- __wt_err(session,
- ret, "%s: handle-close: fclose", fh->name);
- }
+ if (win_fh->filehandle != INVALID_HANDLE_VALUE &&
+ CloseHandle(win_fh->filehandle) == 0) {
+ ret = __wt_getlasterror();
+ __wt_err(session, ret,
+ "%s: handle-close: CloseHandle", file_handle->name);
}
- /* Close the secondary handle. */
- if (fh->filehandle_secondary != INVALID_HANDLE_VALUE &&
- CloseHandle(fh->filehandle_secondary) == 0) {
+ if (win_fh->filehandle_secondary != INVALID_HANDLE_VALUE &&
+ CloseHandle(win_fh->filehandle_secondary) == 0) {
ret = __wt_getlasterror();
__wt_err(session, ret,
- "%s: handle-close: secondary: CloseHandle", fh->name);
+ "%s: handle-close: secondary: CloseHandle",
+ file_handle->name);
}
- return (ret);
-}
-/*
- * __win_handle_getc --
- * ANSI C fgetc.
- */
-static int
-__win_handle_getc(WT_SESSION_IMPL *session, WT_FH *fh, int *chp)
-{
- if (fh->fp == NULL)
- WT_RET_MSG(session,
- ENOTSUP, "%s: handle-getc: no stream configured", fh->name);
-
- *chp = fgetc(fh->fp);
- if (*chp != EOF || !ferror(fh->fp))
- return (0);
- WT_RET_MSG(session, __wt_errno(), "%s: handle-getc: fgetc", fh->name);
+ __wt_free(session, file_handle->name);
+ __wt_free(session, win_fh);
+ return (ret);
}
/*
- * __win_handle_lock --
+ * __win_file_lock --
* Lock/unlock a file.
*/
static int
-__win_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
+__win_file_lock(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
+
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
/*
* WiredTiger requires this function be able to acquire locks past
@@ -298,54 +181,42 @@ __win_handle_lock(WT_SESSION_IMPL *session, WT_FH *fh, bool lock)
* This is useful to coordinate adding records to the end of a file.
*/
if (lock) {
- if (LockFile(fh->filehandle, 0, 0, 1, 0) == FALSE) {
+ if (LockFile(win_fh->filehandle, 0, 0, 1, 0) == FALSE) {
ret = __wt_getlasterror();
__wt_err(session, ret,
- "%s: handle-lock: LockFile", fh->name);
+ "%s: handle-lock: LockFile", file_handle->name);
}
} else
- if (UnlockFile(fh->filehandle, 0, 0, 1, 0) == FALSE) {
+ if (UnlockFile(win_fh->filehandle, 0, 0, 1, 0) == FALSE) {
ret = __wt_getlasterror();
__wt_err(session, ret,
- "%s: handle-lock: UnlockFile", fh->name);
+ "%s: handle-lock: UnlockFile", file_handle->name);
}
return (ret);
}
/*
- * __win_handle_printf --
- * ANSI C vfprintf.
- */
-static int
-__win_handle_printf(
- WT_SESSION_IMPL *session, WT_FH *fh, const char *fmt, va_list ap)
-{
- if (fh->fp == NULL)
- WT_RET_MSG(session, ENOTSUP,
- "%s: vfprintf: no stream configured", fh->name);
-
- if (vfprintf(fh->fp, fmt, ap) >= 0)
- return (0);
- WT_RET_MSG(session, EIO, "%s: handle-printf: vfprintf", fh->name);
-}
-
-/*
- * __win_handle_read --
+ * __win_file_read --
* Read a chunk.
*/
static int
-__win_handle_read(
- WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
+__win_file_read(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
{
DWORD chunk, nr;
uint8_t *addr;
OVERLAPPED overlapped = { 0 };
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
+
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
nr = 0;
/* Assert direct I/O is aligned and a multiple of the alignment. */
WT_ASSERT(session,
- !fh->direct_io ||
+ !win_fh->direct_io ||
S2C(session)->buffer_alignment == 0 ||
(!((uintptr_t)buf &
(uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
@@ -358,44 +229,54 @@ __win_handle_read(
overlapped.Offset = UINT32_MAX & offset;
overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
- if (!ReadFile(fh->filehandle, addr, chunk, &nr, &overlapped))
+ if (!ReadFile(
+ win_fh->filehandle, addr, chunk, &nr, &overlapped))
WT_RET_MSG(session,
- nr == 0 ? WT_ERROR : __wt_getlasterror(),
+ __wt_getlasterror(),
"%s: handle-read: ReadFile: failed to read %lu "
"bytes at offset %" PRIuMAX,
- fh->name, chunk, (uintmax_t)offset);
+ file_handle->name, chunk, (uintmax_t)offset);
}
return (0);
}
/*
- * __win_handle_size --
+ * __win_file_size --
* Get the size of a file in bytes, by file handle.
*/
static int
-__win_handle_size(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t *sizep)
+__win_file_size(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t *sizep)
{
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
LARGE_INTEGER size;
- if (GetFileSizeEx(fh->filehandle, &size) != 0) {
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ if (GetFileSizeEx(win_fh->filehandle, &size) != 0) {
*sizep = size.QuadPart;
return (0);
}
- WT_RET_MSG(session,
- __wt_getlasterror(), "%s: handle-size: GetFileSizeEx", fh->name);
+ WT_RET_MSG(session, __wt_getlasterror(),
+ "%s: handle-size: GetFileSizeEx", file_handle->name);
}
/*
- * __win_handle_sync --
- * MSVC fflush/fsync.
+ * __win_file_sync --
+ * MSVC fsync.
*/
static int
-__win_handle_sync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
+__win_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
- WT_ASSERT(session, !F_ISSET(S2C(session), WT_CONN_READONLY));
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
/*
* We don't open Windows system handles when opening directories
@@ -403,76 +284,79 @@ __win_handle_sync(WT_SESSION_IMPL *session, WT_FH *fh, bool block)
* a directory on Windows. Confirm the file handle is set before
* attempting to sync it.
*/
- if (fh->fp == NULL && fh->filehandle == INVALID_HANDLE_VALUE)
+ if (win_fh->filehandle == INVALID_HANDLE_VALUE)
return (0);
- if (fh->fp == NULL) {
- /*
- * Callers attempting asynchronous flush handle ENOTSUP returns,
- * and won't make further attempts.
- */
- if (!block)
- return (ENOTSUP);
-
- if ((ret = FlushFileBuffers(fh->filehandle)) == FALSE)
- WT_RET_MSG(session, __wt_getlasterror(),
- "%s handle-sync: FlushFileBuffers error", fh->name);
- return (0);
+ if (FlushFileBuffers(win_fh->filehandle) == FALSE) {
+ ret = __wt_getlasterror();
+ WT_RET_MSG(session, ret,
+ "%s handle-sync: FlushFileBuffers error",
+ file_handle->name);
}
-
- if (fflush(fh->fp) == 0)
- return (0);
- WT_RET_MSG(session, __wt_errno(), "%s: handle-sync: fflush", fh->name);
+ return (0);
}
/*
- * __win_handle_truncate --
+ * __win_file_truncate --
* Truncate a file.
*/
static int
-__win_handle_truncate(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t len)
+__win_file_truncate(
+ WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t len)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
LARGE_INTEGER largeint;
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
+
largeint.QuadPart = len;
- if (fh->filehandle_secondary == INVALID_HANDLE_VALUE)
+ if (win_fh->filehandle_secondary == INVALID_HANDLE_VALUE)
WT_RET_MSG(session, EINVAL,
- "%s: handle-truncate: read-only", fh->name);
+ "%s: handle-truncate: read-only", file_handle->name);
if (SetFilePointerEx(
- fh->filehandle_secondary, largeint, NULL, FILE_BEGIN) == FALSE)
+ win_fh->filehandle_secondary, largeint, NULL, FILE_BEGIN) == FALSE)
WT_RET_MSG(session, __wt_getlasterror(),
- "%s: handle-truncate: SetFilePointerEx", fh->name);
+ "%s: handle-truncate: SetFilePointerEx",
+ file_handle->name);
- if (SetEndOfFile(fh->filehandle_secondary) == FALSE) {
+ if (SetEndOfFile(win_fh->filehandle_secondary) == FALSE) {
if (GetLastError() == ERROR_USER_MAPPED_FILE)
return (EBUSY);
WT_RET_MSG(session, __wt_getlasterror(),
- "%s: handle-truncate: SetEndOfFile error", fh->name);
+ "%s: handle-truncate: SetEndOfFile error",
+ file_handle->name);
}
return (0);
}
/*
- * __win_handle_write --
+ * __win_file_write --
* Write a chunk.
*/
static int
-__win_handle_write(WT_SESSION_IMPL *session,
- WT_FH *fh, wt_off_t offset, size_t len, const void *buf)
+__win_file_write(WT_FILE_HANDLE *file_handle,
+ WT_SESSION *wt_session, wt_off_t offset, size_t len, const void *buf)
{
DWORD chunk;
DWORD nw;
const uint8_t *addr;
OVERLAPPED overlapped = { 0 };
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
+
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
nw = 0;
/* Assert direct I/O is aligned and a multiple of the alignment. */
WT_ASSERT(session,
- !fh->direct_io ||
+ !win_fh->direct_io ||
S2C(session)->buffer_alignment == 0 ||
(!((uintptr_t)buf &
(uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
@@ -485,38 +369,47 @@ __win_handle_write(WT_SESSION_IMPL *session,
overlapped.Offset = UINT32_MAX & offset;
overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);
- if (!WriteFile(fh->filehandle, addr, chunk, &nw, &overlapped))
+ if (!WriteFile(
+ win_fh->filehandle, addr, chunk, &nw, &overlapped))
WT_RET_MSG(session, __wt_getlasterror(),
"%s: handle-write: WriteFile: failed to write %lu "
"bytes at offset %" PRIuMAX,
- fh->name, chunk, (uintmax_t)offset);
+ file_handle->name, chunk, (uintmax_t)offset);
}
return (0);
}
/*
- * __win_handle_open --
+ * __win_open_file --
* Open a file handle.
*/
static int
-__win_handle_open(WT_SESSION_IMPL *session,
- WT_FH *fh, const char *name, uint32_t file_type, uint32_t flags)
+__win_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
+ const char *name, WT_OPEN_FILE_TYPE file_type, uint32_t flags,
+ WT_FILE_HANDLE **file_handlep)
{
DWORD dwCreationDisposition;
- HANDLE filehandle, filehandle_secondary;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
+ WT_FILE_HANDLE *file_handle;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
int desired_access, f;
- bool direct_io;
- const char *stream_mode;
+ WT_UNUSED(file_system);
+
+ *file_handlep = NULL;
+
+ session = (WT_SESSION_IMPL *)wt_session;
conn = S2C(session);
- direct_io = false;
+
+ WT_RET(__wt_calloc_one(session, &win_fh));
+
+ win_fh->direct_io = false;
/* Set up error handling. */
- fh->filehandle = fh->filehandle_secondary =
- filehandle = filehandle_secondary = INVALID_HANDLE_VALUE;
- fh->fp = NULL;
+ win_fh->filehandle =
+ win_fh->filehandle_secondary = INVALID_HANDLE_VALUE;
/*
* Opening a file handle on a directory is only to support filesystems
@@ -524,7 +417,7 @@ __win_handle_open(WT_SESSION_IMPL *session,
* require that functionality: create an empty WT_FH structure with
* invalid handles.
*/
- if (file_type == WT_FILE_TYPE_DIRECTORY)
+ if (file_type == WT_OPEN_FILE_TYPE_DIRECTORY)
goto directory_open;
desired_access = GENERIC_READ;
@@ -549,43 +442,36 @@ __win_handle_open(WT_SESSION_IMPL *session,
} else
dwCreationDisposition = OPEN_EXISTING;
- /*
- * direct_io means no OS file caching. This requires aligned buffer
- * allocations like O_DIRECT.
- */
- if (FLD_ISSET(conn->direct_io, file_type) ||
- (LF_ISSET(WT_OPEN_READONLY) &&
- file_type == WT_FILE_TYPE_DATA &&
- FLD_ISSET(conn->direct_io, WT_FILE_TYPE_CHECKPOINT))) {
+ /* Direct I/O. */
+ if (LF_ISSET(WT_OPEN_DIRECTIO)) {
f |= FILE_FLAG_NO_BUFFERING;
- direct_io = true;
+ win_fh->direct_io = true;
}
- fh->direct_io = direct_io;
/* FILE_FLAG_WRITE_THROUGH does not require aligned buffers */
if (FLD_ISSET(conn->write_through, file_type))
f |= FILE_FLAG_WRITE_THROUGH;
- if (file_type == WT_FILE_TYPE_LOG &&
+ if (file_type == WT_OPEN_FILE_TYPE_LOG &&
FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC))
f |= FILE_FLAG_WRITE_THROUGH;
/* Disable read-ahead on trees: it slows down random read workloads. */
- if (file_type == WT_FILE_TYPE_DATA)
+ if (file_type == WT_OPEN_FILE_TYPE_DATA)
f |= FILE_FLAG_RANDOM_ACCESS;
- filehandle = CreateFileA(name, desired_access,
+ win_fh->filehandle = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, dwCreationDisposition, f, NULL);
- if (filehandle == INVALID_HANDLE_VALUE) {
+ if (win_fh->filehandle == INVALID_HANDLE_VALUE) {
if (LF_ISSET(WT_OPEN_CREATE) &&
GetLastError() == ERROR_FILE_EXISTS)
- filehandle = CreateFileA(name, desired_access,
+ win_fh->filehandle = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, OPEN_EXISTING, f, NULL);
- if (filehandle == INVALID_HANDLE_VALUE)
+ if (win_fh->filehandle == INVALID_HANDLE_VALUE)
WT_ERR_MSG(session, __wt_getlasterror(),
- direct_io ?
+ win_fh->direct_io ?
"%s: handle-open: CreateFileA: failed with direct "
"I/O configured, some filesystem types do not "
"support direct I/O" :
@@ -598,78 +484,60 @@ __win_handle_open(WT_SESSION_IMPL *session,
* pointer.
*/
if (!LF_ISSET(WT_OPEN_READONLY)) {
- filehandle_secondary = CreateFileA(name, desired_access,
+ win_fh->filehandle_secondary = CreateFileA(name, desired_access,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL, OPEN_EXISTING, f, NULL);
- if (filehandle_secondary == INVALID_HANDLE_VALUE)
+ if (win_fh->filehandle_secondary == INVALID_HANDLE_VALUE)
WT_ERR_MSG(session, __wt_getlasterror(),
"%s: handle-open: CreateFileA: secondary", name);
}
- /* Optionally configure a stdio stream API. */
- switch (LF_MASK(WT_STREAM_APPEND | WT_STREAM_READ | WT_STREAM_WRITE)) {
- case WT_STREAM_APPEND:
- f = _O_APPEND | _O_TEXT;
- stream_mode = "a";
- F_SET(fh, WT_FH_FLUSH_ON_CLOSE);
- break;
- case WT_STREAM_READ:
- f = _O_RDONLY | _O_TEXT;
- stream_mode = "r";
- break;
- case WT_STREAM_WRITE:
- f = _O_TEXT;
- stream_mode = "w";
- F_SET(fh, WT_FH_FLUSH_ON_CLOSE);
- break;
- case 0:
- default:
- stream_mode = NULL;
- break;
- }
- if (stream_mode != NULL) {
- if ((fh->fp = fopen(name, stream_mode)) == NULL)
- WT_ERR_MSG(session, __wt_errno(),
- "%s: handle-open: fopen", name);
-
- if (LF_ISSET(WT_STREAM_LINE_BUFFER))
- __wt_stream_set_line_buffer(fh->fp);
- }
+directory_open:
+ /* Initialize public information. */
+ file_handle = (WT_FILE_HANDLE *)win_fh;
+ WT_ERR(__wt_strdup(session, name, &file_handle->name));
- /* Configure fallocate/posix_fallocate calls. */
- __win_handle_allocate_configure(session, fh);
+ file_handle->close = __win_file_close;
+ file_handle->lock = __win_file_lock;
+#ifdef WORDS_BIGENDIAN
+ /*
+ * The underlying objects are little-endian; mapping objects isn't
+ * currently supported on big-endian systems.
+ */
+#else
+ file_handle->map = __wt_win_map;
+ file_handle->unmap = __wt_win_unmap;
+#endif
+ file_handle->read = __win_file_read;
+ file_handle->size = __win_file_size;
+ file_handle->sync = __win_file_sync;
+ file_handle->truncate = __win_file_truncate;
+ file_handle->write = __win_file_write;
-directory_open:
- fh->filehandle = filehandle;
- fh->filehandle_secondary = filehandle_secondary;
-
- fh->fh_advise = __win_handle_advise;
- fh->fh_allocate = __win_handle_allocate;
- fh->fh_close = __win_handle_close;
- fh->fh_getc = __win_handle_getc;
- fh->fh_lock = __win_handle_lock;
- fh->fh_map = __wt_win_map;
- fh->fh_map_discard = __wt_win_map_discard;
- fh->fh_map_preload = __wt_win_map_preload;
- fh->fh_map_unmap = __wt_win_map_unmap;
- fh->fh_printf = __win_handle_printf;
- fh->fh_read = __win_handle_read;
- fh->fh_size = __win_handle_size;
- fh->fh_sync = __win_handle_sync;
- fh->fh_truncate = __win_handle_truncate;
- fh->fh_write = __win_handle_write;
+ *file_handlep = file_handle;
return (0);
-err: if (filehandle != INVALID_HANDLE_VALUE)
- (void)CloseHandle(filehandle);
- if (filehandle_secondary != INVALID_HANDLE_VALUE)
- (void)CloseHandle(filehandle_secondary);
-
+err: WT_TRET(__win_file_close((WT_FILE_HANDLE *)win_fh, wt_session));
return (ret);
}
/*
+ * __win_terminate --
+ * Discard a Windows configuration.
+ */
+static int
+__win_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session)
+{
+ WT_SESSION_IMPL *session;
+
+ session = (WT_SESSION_IMPL *)wt_session;
+
+ __wt_free(session, file_system);
+ return (0);
+}
+
+/*
* __wt_os_win --
* Initialize a MSVC configuration.
*/
@@ -677,29 +545,24 @@ int
__wt_os_win(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_FILE_SYSTEM *file_system;
conn = S2C(session);
- /* Initialize the POSIX jump table. */
- conn->file_directory_list = __wt_win_directory_list;
- conn->file_directory_sync = __win_directory_sync;
- conn->file_exist = __win_file_exist;
- conn->file_remove = __win_file_remove;
- conn->file_rename = __win_file_rename;
- conn->file_size = __win_file_size;
- conn->handle_open = __win_handle_open;
+ WT_RET(__wt_calloc_one(session, &file_system));
- return (0);
-}
+ /* Initialize the Windows jump table. */
+ file_system->directory_list = __wt_win_directory_list;
+ file_system->directory_list_free = __wt_win_directory_list_free;
+ file_system->exist = __win_fs_exist;
+ file_system->open_file = __win_open_file;
+ file_system->remove = __win_fs_remove;
+ file_system->rename = __win_fs_rename;
+ file_system->size = __wt_win_fs_size;
+ file_system->terminate = __win_terminate;
-/*
- * __wt_os_win_cleanup --
- * Discard a POSIX configuration.
- */
-int
-__wt_os_win_cleanup(WT_SESSION_IMPL *session)
-{
- WT_UNUSED(session);
+ /* Switch it into place. */
+ conn->file_system = file_system;
return (0);
}
diff --git a/src/os_win/os_map.c b/src/os_win/os_map.c
index b043f9c9923..488cbfb2ceb 100644
--- a/src/os_win/os_map.c
+++ b/src/os_win/os_map.c
@@ -13,106 +13,83 @@
* Map a file into memory.
*/
int
-__wt_win_map(WT_SESSION_IMPL *session,
- WT_FH *fh, void *mapp, size_t *lenp, void **mappingcookie)
+__wt_win_map(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
+ void *mapped_regionp, size_t *lenp, void *mapped_cookiep)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
size_t len;
wt_off_t file_size;
- void *map;
+ void *map, *mapped_cookie;
+
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
/*
* There's no locking here to prevent the underlying file from changing
* underneath us, our caller needs to ensure consistency of the mapped
* region vs. any other file activity.
*/
- WT_RET(__wt_filesize(session, fh, &file_size));
+ WT_RET(__wt_win_fs_size(file_handle->file_system,
+ wt_session, file_handle->name, &file_size));
len = (size_t)file_size;
(void)__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: memory-map: %" WT_SIZET_FMT " bytes", fh->name, len);
+ "%s: memory-map: %" WT_SIZET_FMT " bytes", file_handle->name, len);
- *mappingcookie =
- CreateFileMappingA(fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL);
- if (*mappingcookie == NULL)
+ mapped_cookie = CreateFileMappingA(
+ win_fh->filehandle, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (mapped_cookie == NULL)
WT_RET_MSG(session, __wt_getlasterror(),
- "%s: memory-map: CreateFileMappingA", fh->name);
+ "%s: memory-map: CreateFileMappingA", file_handle->name);
if ((map =
- MapViewOfFile(*mappingcookie, FILE_MAP_READ, 0, 0, len)) == NULL) {
+ MapViewOfFile(mapped_cookie, FILE_MAP_READ, 0, 0, len)) == NULL) {
/* Retrieve the error before cleaning up. */
ret = __wt_getlasterror();
- CloseHandle(*mappingcookie);
- *mappingcookie = NULL;
+ CloseHandle(mapped_cookie);
WT_RET_MSG(session, ret,
- "%s: memory-map: MapViewOfFile", fh->name);
+ "%s: memory-map: MapViewOfFile", file_handle->name);
}
- *(void **)mapp = map;
+ *(void **)mapped_cookiep = mapped_cookie;
+ *(void **)mapped_regionp = map;
*lenp = len;
return (0);
}
/*
- * __wt_win_map_preload --
- * Cause a section of a memory map to be faulted in.
- */
-int
-__wt_win_map_preload(
- WT_SESSION_IMPL *session, WT_FH *fh, const void *p, size_t size)
-{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(p);
- WT_UNUSED(size);
-
- return (ENOTSUP);
-}
-
-/*
- * __wt_win_map_discard --
- * Discard a chunk of the memory map.
- */
-int
-__wt_win_map_discard(WT_SESSION_IMPL *session, WT_FH *fh, void *p, size_t size)
-{
- WT_UNUSED(session);
- WT_UNUSED(fh);
- WT_UNUSED(p);
- WT_UNUSED(size);
-
- return (ENOTSUP);
-}
-
-/*
- * __wt_win_map_unmap --
+ * __wt_win_unmap --
* Remove a memory mapping.
*/
int
-__wt_win_map_unmap(WT_SESSION_IMPL *session,
- WT_FH *fh, void *map, size_t len, void **mappingcookie)
+__wt_win_unmap(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
+ void *mapped_region, size_t length, void *mapped_cookie)
{
WT_DECL_RET;
+ WT_FILE_HANDLE_WIN *win_fh;
+ WT_SESSION_IMPL *session;
- (void)__wt_verbose(session, WT_VERB_HANDLEOPS,
- "%s: memory-unmap: %" WT_SIZET_FMT " bytes", fh->name, len);
+ win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
+ session = (WT_SESSION_IMPL *)wt_session;
- WT_ASSERT(session, *mappingcookie != NULL);
+ (void)__wt_verbose(session, WT_VERB_HANDLEOPS,
+ "%s: memory-unmap: %" WT_SIZET_FMT " bytes",
+ file_handle->name, length);
- if (UnmapViewOfFile(map) == 0) {
+ if (UnmapViewOfFile(mapped_region) == 0) {
ret = __wt_getlasterror();
__wt_err(session, ret,
- "%s: memory-unmap: UnmapViewOfFile", fh->name);
+ "%s: memory-unmap: UnmapViewOfFile", file_handle->name);
}
- if (CloseHandle(*mappingcookie) == 0) {
+ if (CloseHandle(*(void **)mapped_cookie) == 0) {
ret = __wt_getlasterror();
__wt_err(session, ret,
- "%s: memory-unmap: CloseHandle", fh->name);
+ "%s: memory-unmap: CloseHandle", file_handle->name);
}
- *mappingcookie = NULL;
-
return (ret);
}
diff --git a/src/os_win/os_thread.c b/src/os_win/os_thread.c
index 94c5a8b0ab2..2b846216442 100644
--- a/src/os_win/os_thread.c
+++ b/src/os_win/os_thread.c
@@ -21,7 +21,7 @@ __wt_thread_create(WT_SESSION_IMPL *session,
if (*tidret != 0)
return (0);
- WT_RET_MSG(session, __wt_errno, "thread create: _beginthreadex");
+ WT_RET_MSG(session, __wt_errno(), "thread create: _beginthreadex");
}
/*
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 26123f6b66d..a46662b4b9d 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -299,13 +299,13 @@ static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
static int __rec_cell_build_val(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, uint64_t);
-static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *);
static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
-static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
+ WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *);
+static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *);
static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_col_var(WT_SESSION_IMPL *,
- WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
+ WT_RECONCILE *, WT_REF *, WT_SALVAGE_COOKIE *);
static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
WT_SALVAGE_COOKIE *, WT_ITEM *, bool, uint8_t, uint64_t);
static int __rec_destroy_session(WT_SESSION_IMPL *);
@@ -391,16 +391,16 @@ __wt_reconcile(WT_SESSION_IMPL *session,
switch (page->type) {
case WT_PAGE_COL_FIX:
if (salvage != NULL)
- ret = __rec_col_fix_slvg(session, r, page, salvage);
+ ret = __rec_col_fix_slvg(session, r, ref, salvage);
else
- ret = __rec_col_fix(session, r, page);
+ ret = __rec_col_fix(session, r, ref);
break;
case WT_PAGE_COL_INT:
WT_WITH_PAGE_INDEX(session,
- ret = __rec_col_int(session, r, page));
+ ret = __rec_col_int(session, r, ref));
break;
case WT_PAGE_COL_VAR:
- ret = __rec_col_var(session, r, page, salvage);
+ ret = __rec_col_var(session, r, ref, salvage);
break;
case WT_PAGE_ROW_INT:
WT_WITH_PAGE_INDEX(session,
@@ -630,12 +630,12 @@ __rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
*/
switch (page->type) {
case WT_PAGE_COL_INT:
- WT_RET(__wt_page_alloc(session, WT_PAGE_COL_INT,
- 1, mod->mod_multi_entries, false, &next));
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_COL_INT, mod->mod_multi_entries, false, &next));
break;
case WT_PAGE_ROW_INT:
- WT_RET(__wt_page_alloc(session, WT_PAGE_ROW_INT,
- WT_RECNO_OOB, mod->mod_multi_entries, false, &next));
+ WT_RET(__wt_page_alloc(session,
+ WT_PAGE_ROW_INT, mod->mod_multi_entries, false, &next));
break;
WT_ILLEGAL_VALUE(session);
}
@@ -2465,7 +2465,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
WT_SESSION *wt_session;
size_t corrected_page_size, extra_skip, len, result_len;
uint64_t recno;
- uint32_t entry, i, result_slots, slots;
+ uint32_t entry, i, max_image_slot, result_slots, slots;
bool last_block;
uint8_t *dsk_start;
@@ -2525,7 +2525,7 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
if (dsk->type == WT_PAGE_COL_VAR)
recno = last->recno;
- entry = slots = 0;
+ entry = max_image_slot = slots = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
++entry;
@@ -2575,6 +2575,15 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
dsk->type == WT_PAGE_COL_VAR)
r->raw_recnos[slots] = recno;
r->raw_entries[slots] = entry;
+
+ /*
+ * Don't create an image so large that any future update will
+ * cause a split in memory. Use half of the maximum size so
+ * we split very compressible pages that have reached the
+ * maximum size in memory into two equal blocks.
+ */
+ if (len > (size_t)btree->maxmempage / 2)
+ max_image_slot = slots;
}
/*
@@ -2634,21 +2643,32 @@ __rec_split_raw_worker(WT_SESSION_IMPL *session,
ret = compressor->compress_raw(compressor, wt_session,
r->page_size_orig, btree->split_pct,
WT_BLOCK_COMPRESS_SKIP + extra_skip,
- (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP,
- r->raw_offsets, slots,
+ (uint8_t *)dsk + WT_BLOCK_COMPRESS_SKIP, r->raw_offsets,
+ no_more_rows || max_image_slot == 0 ? slots : max_image_slot,
(uint8_t *)dst->mem + WT_BLOCK_COMPRESS_SKIP,
- result_len, no_more_rows, &result_len, &result_slots);
+ result_len,
+ no_more_rows || max_image_slot != 0,
+ &result_len, &result_slots);
switch (ret) {
case EAGAIN:
/*
+ * The compression function wants more rows; accumulate and
+ * retry if possible.
+ * The compression function wants more rows, accumulate and
+ * retry if possible.
*
- * Reset the resulting slots count, just in case the compression
- * function modified it before giving up.
+ * First, reset the resulting slots count, just in case the
+ * compression function modified it before giving up.
*/
result_slots = 0;
- break;
+
+ /*
+ * If the image is too large and there are more rows to gather,
+ * act as if the compression engine gave up on this chunk of
+ * data. That return doesn't make sense (we told the engine we
+ * wouldn't give it any more rows), but it's a possible return.
+ */
+ if (no_more_rows || max_image_slot == 0)
+ break;
+ /* FALLTHROUGH */
case 0:
/*
* If the compression function returned zero result slots, it's
@@ -3431,7 +3451,7 @@ __rec_update_las(WT_SESSION_IMPL *session,
case WT_PAGE_ROW_LEAF:
if (list->ins == NULL) {
slot = WT_ROW_SLOT(page, list->rip);
- upd = page->pg_row_upd[slot];
+ upd = page->modify->mod_row_update[slot];
} else
upd = list->ins->upd;
break;
@@ -3787,7 +3807,7 @@ __rec_vtype(WT_ADDR *addr)
* Reconcile a column-store internal page.
*/
static int
-__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+__rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
{
WT_ADDR *addr;
WT_BTREE *btree;
@@ -3795,11 +3815,12 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
WT_CHILD_STATE state;
WT_DECL_RET;
WT_KV *val;
- WT_PAGE *child;
+ WT_PAGE *child, *page;
WT_REF *ref;
bool hazard;
btree = S2BT(session);
+ page = pageref->page;
child = NULL;
hazard = false;
@@ -3807,12 +3828,12 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
vpack = &_vpack;
WT_RET(__rec_split_init(
- session, r, page, page->pg_intl_recno, btree->maxintlpage));
+ session, r, page, pageref->ref_recno, btree->maxintlpage));
/* For each entry in the in-memory page... */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
/* Update the starting record number in case we split. */
- r->recno = ref->key.recno;
+ r->recno = ref->ref_recno;
/*
* Modified child.
@@ -3886,7 +3907,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
} else
__rec_cell_build_addr(session, r,
addr->addr, addr->size,
- __rec_vtype(addr), ref->key.recno);
+ __rec_vtype(addr), ref->ref_recno);
WT_CHILD_RELEASE_ERR(session, hazard, ref);
/* Boundary: split or write the page. */
@@ -3951,31 +3972,34 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* Reconcile a fixed-width, column-store leaf page.
*/
static int
-__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
+__rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref)
{
WT_BTREE *btree;
WT_INSERT *ins;
+ WT_PAGE *page;
WT_UPDATE *upd;
uint64_t recno;
uint32_t entry, nrecs;
btree = S2BT(session);
+ page = pageref->page;
WT_RET(__rec_split_init(
- session, r, page, page->pg_fix_recno, btree->maxleafpage));
+ session, r, page, pageref->ref_recno, btree->maxleafpage));
+
+ /* Copy the original, disk-image bytes into place. */
+ memcpy(r->first_free, page->pg_fix_bitf,
+ __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
/* Update any changes to the original on-page data items. */
WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd));
if (upd != NULL)
- __bit_setv_recno(page, WT_INSERT_RECNO(ins),
- btree->bitcnt, ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ __bit_setv(r->first_free,
+ WT_INSERT_RECNO(ins) - pageref->ref_recno,
+ btree->bitcnt, *(uint8_t *)WT_UPDATE_DATA(upd));
}
- /* Copy the updated, disk-image bytes into place. */
- memcpy(r->first_free, page->pg_fix_bitf,
- __bitstr_size((size_t)page->pg_fix_entries * btree->bitcnt));
-
/* Calculate the number of entries per page remainder. */
entry = page->pg_fix_entries;
nrecs = WT_FIX_BYTES_TO_ENTRIES(
@@ -4002,7 +4026,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
* the last key on this page, we have to decrement it.
*/
if ((recno =
- page->modify->mod_split_recno) == WT_RECNO_OOB)
+ page->modify->mod_col_split_recno) == WT_RECNO_OOB)
break;
recno -= 1;
@@ -4032,7 +4056,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
if (nrecs > 0) {
__bit_setv(r->first_free, entry, btree->bitcnt,
upd == NULL ? 0 :
- ((uint8_t *)WT_UPDATE_DATA(upd))[0]);
+ *(uint8_t *)WT_UPDATE_DATA(upd));
--nrecs;
++entry;
++r->recno;
@@ -4076,13 +4100,15 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
*/
static int
__rec_col_fix_slvg(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+ WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
{
WT_BTREE *btree;
+ WT_PAGE *page;
uint64_t page_start, page_take;
uint32_t entry, nrecs;
btree = S2BT(session);
+ page = pageref->page;
/*
* !!!
@@ -4097,7 +4123,7 @@ __rec_col_fix_slvg(WT_SESSION_IMPL *session,
* don't want to have to retrofit the code later.
*/
WT_RET(__rec_split_init(
- session, r, page, page->pg_fix_recno, btree->maxleafpage));
+ session, r, page, pageref->ref_recno, btree->maxleafpage));
/* We may not be taking all of the entries on the original page. */
page_take = salvage->take == 0 ? page->pg_fix_entries : salvage->take;
@@ -4220,7 +4246,7 @@ __rec_col_var_helper(WT_SESSION_IMPL *session, WT_RECONCILE *r,
*/
static int
__rec_col_var(WT_SESSION_IMPL *session,
- WT_RECONCILE *r, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
+ WT_RECONCILE *r, WT_REF *pageref, WT_SALVAGE_COOKIE *salvage)
{
enum { OVFL_IGNORE, OVFL_UNUSED, OVFL_USED } ovfl_state;
WT_BTREE *btree;
@@ -4231,6 +4257,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
WT_DECL_RET;
WT_INSERT *ins;
WT_ITEM *last;
+ WT_PAGE *page;
WT_UPDATE *upd;
uint64_t n, nrepeat, repeat_count, rle, skip, src_recno;
uint32_t i, size;
@@ -4238,6 +4265,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
const void *data;
btree = S2BT(session);
+ page = pageref->page;
last = r->last;
vpack = &_vpack;
@@ -4247,7 +4275,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
upd = NULL;
WT_RET(__rec_split_init(
- session, r, page, page->pg_var_recno, btree->maxleafpage));
+ session, r, page, pageref->ref_recno, btree->maxleafpage));
/*
* The salvage code may be calling us to reconcile a page where there
@@ -4561,7 +4589,8 @@ compare: /*
* first key on the split page, that is, one larger than
* the last key on this page, we have to decrement it.
*/
- if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB)
+ if ((n = page->
+ modify->mod_col_split_recno) == WT_RECNO_OOB)
break;
WT_ASSERT(session, n >= src_recno);
n -= 1;
@@ -5430,18 +5459,24 @@ __rec_split_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_free(session, multi->key.ikey);
break;
}
- if (multi->disk_image == NULL) {
- if (multi->addr.reuse)
- multi->addr.addr = NULL;
- else {
- WT_RET(__wt_btree_block_free(session,
- multi->addr.addr, multi->addr.size));
- __wt_free(session, multi->addr.addr);
- }
- } else {
- __wt_free(session, multi->supd);
- __wt_free(session, multi->disk_image);
+
+ /*
+ * If the page was re-written, free the backing disk blocks used
+ * in the previous write (unless the blocks were reused in this
+ * write). The page may instead have been a disk image with
+ * associated saved updates: ownership of the disk image is
+ * transferred when rewriting the page in-memory and there may
+ * not have been saved updates. We've gotten this wrong a few
+ * times, so use the existence of an address to confirm backing
+ * blocks we care about, and free any disk image/saved updates.
+ */
+ if (multi->addr.addr != NULL && !multi->addr.reuse) {
+ WT_RET(__wt_btree_block_free(
+ session, multi->addr.addr, multi->addr.size));
+ __wt_free(session, multi->addr.addr);
}
+ __wt_free(session, multi->supd);
+ __wt_free(session, multi->disk_image);
}
__wt_free(session, mod->mod_multi);
mod->mod_multi_entries = 0;
diff --git a/src/schema/schema_create.c b/src/schema/schema_create.c
index 756f1fdcc6c..67d64cf1c75 100644
--- a/src/schema/schema_create.c
+++ b/src/schema/schema_create.c
@@ -35,7 +35,7 @@ __wt_direct_io_size_check(WT_SESSION_IMPL *session,
* units of its happy place.
*/
if (FLD_ISSET(conn->direct_io,
- WT_FILE_TYPE_CHECKPOINT | WT_FILE_TYPE_DATA)) {
+ WT_DIRECT_IO_CHECKPOINT | WT_DIRECT_IO_DATA)) {
align = (int64_t)conn->buffer_alignment;
if (align != 0 && (cval.val < align || cval.val % align != 0))
WT_RET_MSG(session, EINVAL,
diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c
index e7ce4e42498..1554d021953 100644
--- a/src/schema/schema_open.c
+++ b/src/schema/schema_open.c
@@ -427,6 +427,8 @@ __schema_open_table(WT_SESSION_IMPL *session,
const char *tconfig;
char *tablename;
+ *tablep = NULL;
+
cursor = NULL;
table = NULL;
tablename = NULL;
@@ -527,6 +529,8 @@ __wt_schema_get_colgroup(WT_SESSION_IMPL *session,
const char *tablename, *tend;
u_int i;
+ if (tablep != NULL)
+ *tablep = NULL;
*colgroupp = NULL;
tablename = uri;
@@ -571,6 +575,8 @@ __wt_schema_get_index(WT_SESSION_IMPL *session,
const char *tablename, *tend;
u_int i;
+ if (tablep != NULL)
+ *tablep = NULL;
*indexp = NULL;
tablename = uri;
diff --git a/src/schema/schema_rename.c b/src/schema/schema_rename.c
index 21402ed9332..8f4d374fd22 100644
--- a/src/schema/schema_rename.c
+++ b/src/schema/schema_rename.c
@@ -55,7 +55,7 @@ __rename_file(
default:
WT_ERR(ret);
}
- WT_ERR(__wt_exist(session, newfile, &exist));
+ WT_ERR(__wt_fs_exist(session, newfile, &exist));
if (exist)
WT_ERR_MSG(session, EEXIST, "%s", newfile);
@@ -64,7 +64,7 @@ __rename_file(
WT_ERR(__wt_metadata_insert(session, newuri, oldvalue));
/* Rename the underlying file. */
- WT_ERR(__wt_rename(session, filename, newfile));
+ WT_ERR(__wt_fs_rename(session, filename, newfile));
if (WT_META_TRACKING(session))
WT_ERR(__wt_meta_track_fileop(session, uri, newuri));
diff --git a/src/schema/schema_stat.c b/src/schema/schema_stat.c
index d3d0605c60a..c204d6b1a24 100644
--- a/src/schema/schema_stat.c
+++ b/src/schema/schema_stat.c
@@ -69,6 +69,7 @@ __curstat_size_only(WT_SESSION_IMPL *session,
WT_ITEM namebuf;
wt_off_t filesize;
char *tableconf;
+ bool exist;
WT_CLEAR(namebuf);
*was_fast = false;
@@ -96,10 +97,11 @@ __curstat_size_only(WT_SESSION_IMPL *session,
* are concurrent schema level operations (for example drop). That is
* fine - failing here results in falling back to the slow path of
* opening the handle.
- * !!! Deliberately discard the return code from a failed call - the
- * error is flagged by not setting fast to true.
*/
- if (__wt_filesize_name(session, namebuf.data, true, &filesize) == 0) {
+ WT_ERR(__wt_fs_exist(session, namebuf.data, &exist));
+ if (exist) {
+ WT_ERR(__wt_fs_size(session, namebuf.data, &filesize));
+
/* Setup and populate the statistics structure */
__wt_stat_dsrc_init_single(&cst->u.dsrc_stats);
cst->u.dsrc_stats.block_size = filesize;
diff --git a/src/session/session_api.c b/src/session/session_api.c
index bb496494234..933f2273902 100644
--- a/src/session/session_api.c
+++ b/src/session/session_api.c
@@ -785,8 +785,8 @@ static int
__session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
WT_CURSOR *ref_cursor, const char *config)
{
- WT_CURSOR *firstcg;
WT_CONFIG_ITEM cval;
+ WT_CURSOR *firstcg;
WT_CURSOR_INDEX *cindex;
WT_CURSOR_JOIN *cjoin;
WT_CURSOR_TABLE *ctable;
@@ -794,15 +794,18 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
WT_INDEX *idx;
WT_SESSION_IMPL *session;
WT_TABLE *table;
+ bool nested;
uint64_t count;
uint32_t bloom_bit_count, bloom_hash_count;
uint8_t flags, range;
- count = 0;
- firstcg = NULL;
session = (WT_SESSION_IMPL *)wt_session;
SESSION_API_CALL(session, join, config, cfg);
+
+ firstcg = NULL;
table = NULL;
+ nested = false;
+ count = 0;
if (!WT_PREFIX_MATCH(join_cursor->uri, "join:"))
WT_ERR_MSG(session, EINVAL, "not a join cursor");
@@ -817,19 +820,25 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
ctable = (WT_CURSOR_TABLE *)ref_cursor;
table = ctable->table;
firstcg = ctable->cg_cursors[0];
+ } else if (WT_PREFIX_MATCH(ref_cursor->uri, "join:")) {
+ idx = NULL;
+ table = ((WT_CURSOR_JOIN *)ref_cursor)->table;
+ nested = true;
} else
- WT_ERR_MSG(session, EINVAL, "not an index or table cursor");
+ WT_ERR_MSG(session, EINVAL,
+ "ref_cursor must be an index, table or join cursor");
- if (!F_ISSET(firstcg, WT_CURSTD_KEY_SET))
+ if (firstcg != NULL && !F_ISSET(firstcg, WT_CURSTD_KEY_SET))
WT_ERR_MSG(session, EINVAL,
"requires reference cursor be positioned");
cjoin = (WT_CURSOR_JOIN *)join_cursor;
if (cjoin->table != table)
WT_ERR_MSG(session, EINVAL,
- "table for join cursor does not match table for index");
+ "table for join cursor does not match table for "
+ "ref_cursor");
if (F_ISSET(ref_cursor, WT_CURSTD_JOINED))
WT_ERR_MSG(session, EINVAL,
- "index cursor already used in a join");
+ "cursor already used in a join");
/* "ge" is the default */
range = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ;
@@ -868,15 +877,20 @@ __session_join(WT_SESSION *wt_session, WT_CURSOR *join_cursor,
WT_ERR_MSG(session, EINVAL,
"bloom_hash_count: value too large");
bloom_hash_count = (uint32_t)cval.val;
- if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)) {
- if (count == 0)
- WT_ERR_MSG(session, EINVAL,
- "count must be nonzero when strategy=bloom");
- if (cjoin->entries_next == 0)
- WT_ERR_MSG(session, EINVAL,
- "the first joined cursor cannot specify "
- "strategy=bloom");
- }
+ if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && count == 0)
+ WT_ERR_MSG(session, EINVAL,
+ "count must be nonzero when strategy=bloom");
+
+ WT_ERR(__wt_config_gets(session, cfg, "operation", &cval));
+ if (cval.len != 0 && WT_STRING_MATCH("or", cval.str, cval.len))
+ LF_SET(WT_CURJOIN_ENTRY_DISJUNCTION);
+
+ if (nested && (count != 0 || range != WT_CURJOIN_END_EQ ||
+ LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)))
+ WT_ERR_MSG(session, EINVAL,
+ "joining a nested join cursor is incompatible with "
+ "setting \"strategy\", \"compare\" or \"count\"");
+
WT_ERR(__wt_curjoin_join(session, cjoin, idx, ref_cursor, flags,
range, count, bloom_bit_count, bloom_hash_count));
/*
@@ -1095,7 +1109,7 @@ __session_truncate(WT_SESSION *wt_session,
if (!WT_STREQ(uri, "log:"))
WT_ERR_MSG(session, EINVAL,
"the truncate method should not specify any"
- "target after the log: URI prefix.");
+ "target after the log: URI prefix");
WT_ERR(__wt_log_truncate_files(session, start, cfg));
} else if (WT_PREFIX_MATCH(uri, "file:"))
WT_ERR(__wt_session_range_truncate(
diff --git a/src/support/err.c b/src/support/err.c
index f64492f1561..815b79c16db 100644
--- a/src/support/err.c
+++ b/src/support/err.c
@@ -24,7 +24,7 @@ __handle_error_default(WT_EVENT_HANDLER *handler,
session = (WT_SESSION_IMPL *)wt_session;
WT_RET(__wt_fprintf(session, WT_STDERR(session), "%s\n", errmsg));
- WT_RET(__wt_fsync(session, WT_STDERR(session), true));
+ WT_RET(__wt_fflush(session, WT_STDERR(session)));
return (0);
}
@@ -42,7 +42,7 @@ __handle_message_default(WT_EVENT_HANDLER *handler,
session = (WT_SESSION_IMPL *)wt_session;
WT_RET(__wt_fprintf(session, WT_STDOUT(session), "%s\n", message));
- WT_RET(__wt_fsync(session, WT_STDOUT(session), true));
+ WT_RET(__wt_fflush(session, WT_STDOUT(session)));
return (0);
}
diff --git a/src/support/scratch.c b/src/support/scratch.c
index aea98dc49ef..1881f8ad5a5 100644
--- a/src/support/scratch.c
+++ b/src/support/scratch.c
@@ -117,7 +117,7 @@ __wt_buf_catfmt(WT_SESSION_IMPL *session, WT_ITEM *buf, const char *fmt, ...)
p = (char *)((uint8_t *)buf->mem + buf->size);
WT_ASSERT(session, buf->memsize >= buf->size);
space = buf->memsize - buf->size;
- len = (size_t)vsnprintf(p, (size_t)space, fmt, ap);
+ len = (size_t)vsnprintf(p, space, fmt, ap);
va_end(ap);
/* Check if there was enough space. */
diff --git a/src/support/stat.c b/src/support/stat.c
index 2a826eda962..bb46ad03e43 100644
--- a/src/support/stat.c
+++ b/src/support/stat.c
@@ -60,6 +60,7 @@ static const char * const __stats_dsrc_desc[] = {
"cache: page written requiring lookaside records",
"cache: pages read into cache",
"cache: pages read into cache requiring lookaside entries",
+ "cache: pages requested from the cache",
"cache: pages written from cache",
"cache: pages written requiring in-memory restoration",
"cache: unmodified pages evicted",
@@ -189,6 +190,7 @@ __wt_stat_dsrc_clear_single(WT_DSRC_STATS *stats)
stats->cache_write_lookaside = 0;
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
+ stats->cache_pages_requested = 0;
stats->cache_write = 0;
stats->cache_write_restore = 0;
stats->cache_eviction_clean = 0;
@@ -316,6 +318,7 @@ __wt_stat_dsrc_aggregate_single(
to->cache_write_lookaside += from->cache_write_lookaside;
to->cache_read += from->cache_read;
to->cache_read_lookaside += from->cache_read_lookaside;
+ to->cache_pages_requested += from->cache_pages_requested;
to->cache_write += from->cache_write;
to->cache_write_restore += from->cache_write_restore;
to->cache_eviction_clean += from->cache_eviction_clean;
@@ -455,6 +458,8 @@ __wt_stat_dsrc_aggregate(
WT_STAT_READ(from, cache_write_lookaside);
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_pages_requested +=
+ WT_STAT_READ(from, cache_pages_requested);
to->cache_write += WT_STAT_READ(from, cache_write);
to->cache_write_restore += WT_STAT_READ(from, cache_write_restore);
to->cache_eviction_clean += WT_STAT_READ(from, cache_eviction_clean);
@@ -543,11 +548,16 @@ static const char * const __stats_connection_desc[] = {
"cache: bytes read into cache",
"cache: bytes written from cache",
"cache: checkpoint blocked page eviction",
+ "cache: eviction calls to get a page",
+ "cache: eviction calls to get a page found queue empty",
+ "cache: eviction calls to get a page found queue empty after locking",
"cache: eviction currently operating in aggressive mode",
"cache: eviction server candidate queue empty when topping up",
"cache: eviction server candidate queue not empty when topping up",
"cache: eviction server evicting pages",
"cache: eviction server populating queue, but not evicting pages",
+ "cache: eviction server skipped very large page",
+ "cache: eviction server slept, because we did not make progress with eviction",
"cache: eviction server unable to reach eviction goal",
"cache: eviction worker thread evicting pages",
"cache: failed eviction of pages that exceeded the in-memory maximum",
@@ -570,6 +580,7 @@ static const char * const __stats_connection_desc[] = {
"cache: pages evicted by application threads",
"cache: pages read into cache",
"cache: pages read into cache requiring lookaside entries",
+ "cache: pages requested from the cache",
"cache: pages selected for eviction unable to be evicted",
"cache: pages walked for eviction",
"cache: pages written from cache",
@@ -740,11 +751,16 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_bytes_read = 0;
stats->cache_bytes_write = 0;
stats->cache_eviction_checkpoint = 0;
+ stats->cache_eviction_get_ref = 0;
+ stats->cache_eviction_get_ref_empty = 0;
+ stats->cache_eviction_get_ref_empty2 = 0;
/* not clearing cache_eviction_aggressive_set */
stats->cache_eviction_queue_empty = 0;
stats->cache_eviction_queue_not_empty = 0;
stats->cache_eviction_server_evicting = 0;
stats->cache_eviction_server_not_evicting = 0;
+ stats->cache_eviction_server_toobig = 0;
+ stats->cache_eviction_server_slept = 0;
stats->cache_eviction_slow = 0;
stats->cache_eviction_worker_evicting = 0;
stats->cache_eviction_force_fail = 0;
@@ -767,6 +783,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->cache_eviction_app = 0;
stats->cache_read = 0;
stats->cache_read_lookaside = 0;
+ stats->cache_pages_requested = 0;
stats->cache_eviction_fail = 0;
stats->cache_eviction_walk = 0;
stats->cache_write = 0;
@@ -928,6 +945,12 @@ __wt_stat_connection_aggregate(
to->cache_bytes_write += WT_STAT_READ(from, cache_bytes_write);
to->cache_eviction_checkpoint +=
WT_STAT_READ(from, cache_eviction_checkpoint);
+ to->cache_eviction_get_ref +=
+ WT_STAT_READ(from, cache_eviction_get_ref);
+ to->cache_eviction_get_ref_empty +=
+ WT_STAT_READ(from, cache_eviction_get_ref_empty);
+ to->cache_eviction_get_ref_empty2 +=
+ WT_STAT_READ(from, cache_eviction_get_ref_empty2);
to->cache_eviction_aggressive_set +=
WT_STAT_READ(from, cache_eviction_aggressive_set);
to->cache_eviction_queue_empty +=
@@ -938,6 +961,10 @@ __wt_stat_connection_aggregate(
WT_STAT_READ(from, cache_eviction_server_evicting);
to->cache_eviction_server_not_evicting +=
WT_STAT_READ(from, cache_eviction_server_not_evicting);
+ to->cache_eviction_server_toobig +=
+ WT_STAT_READ(from, cache_eviction_server_toobig);
+ to->cache_eviction_server_slept +=
+ WT_STAT_READ(from, cache_eviction_server_slept);
to->cache_eviction_slow += WT_STAT_READ(from, cache_eviction_slow);
to->cache_eviction_worker_evicting +=
WT_STAT_READ(from, cache_eviction_worker_evicting);
@@ -973,6 +1000,8 @@ __wt_stat_connection_aggregate(
to->cache_eviction_app += WT_STAT_READ(from, cache_eviction_app);
to->cache_read += WT_STAT_READ(from, cache_read);
to->cache_read_lookaside += WT_STAT_READ(from, cache_read_lookaside);
+ to->cache_pages_requested +=
+ WT_STAT_READ(from, cache_pages_requested);
to->cache_eviction_fail += WT_STAT_READ(from, cache_eviction_fail);
to->cache_eviction_walk += WT_STAT_READ(from, cache_eviction_walk);
to->cache_write += WT_STAT_READ(from, cache_write);
diff --git a/src/txn/txn.c b/src/txn/txn.c
index 7a768a8fe20..a2ae97fbd20 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -108,17 +108,17 @@ __wt_txn_release_snapshot(WT_SESSION_IMPL *session)
* __wt_txn_get_snapshot --
* Allocate a snapshot.
*/
-void
+int
__wt_txn_get_snapshot(WT_SESSION_IMPL *session)
{
WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_TXN *txn;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s, *txn_state;
uint64_t current_id, id;
uint64_t prev_oldest_id, snap_min;
uint32_t i, n, session_cnt;
- int32_t count;
conn = S2C(session);
txn = &session->txn;
@@ -126,15 +126,13 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
txn_state = WT_SESSION_TXN_STATE(session);
/*
- * We're going to scan. Increment the count of scanners to prevent the
- * oldest ID from moving forwards. Spin if the count is negative,
- * which indicates that some thread is moving the oldest ID forwards.
+ * Spin waiting for the lock: the sleeps in our blocking readlock
+ * implementation are too slow for scanning the transaction table.
*/
- do {
- if ((count = txn_global->scan_count) < 0)
- WT_PAUSE();
- } while (count < 0 ||
- !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
+ while ((ret =
+ __wt_try_readlock(session, txn_global->scan_rwlock)) == EBUSY)
+ WT_PAUSE();
+ WT_RET(ret);
current_id = snap_min = txn_global->current;
prev_oldest_id = txn_global->oldest_id;
@@ -145,11 +143,9 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
__txn_sort_snapshot(session, 0, current_id);
/* Check that the oldest ID has not moved in the meantime. */
- if (prev_oldest_id == txn_global->oldest_id) {
- WT_ASSERT(session, txn_global->scan_count > 0);
- (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
- return;
- }
+ WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
+ WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
+ return (0);
}
/* Walk the array of concurrent transactions. */
@@ -182,67 +178,35 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session)
WT_ASSERT(session, prev_oldest_id == txn_global->oldest_id);
txn_state->snap_min = snap_min;
- WT_ASSERT(session, txn_global->scan_count > 0);
- (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
+ WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
__txn_sort_snapshot(session, n, current_id);
+ return (0);
}
/*
- * __wt_txn_update_oldest --
- * Sweep the running transactions to update the oldest ID required.
- * !!!
- * If a data-source is calling the WT_EXTENSION_API.transaction_oldest
- * method (for the oldest transaction ID not yet visible to a running
- * transaction), and then comparing that oldest ID against committed
- * transactions to see if updates for a committed transaction are still
- * visible to running transactions, the oldest transaction ID may be
- * the same as the last committed transaction ID, if the transaction
- * state wasn't refreshed after the last transaction committed. Push
- * past the last committed transaction.
-*/
-void
-__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
+ * __txn_oldest_scan --
+ * Sweep the running transactions to calculate the oldest ID required.
+ */
+static void
+__txn_oldest_scan(WT_SESSION_IMPL *session,
+ uint64_t *oldest_idp, uint64_t *last_runningp,
+ WT_SESSION_IMPL **oldest_sessionp)
{
WT_CONNECTION_IMPL *conn;
WT_SESSION_IMPL *oldest_session;
WT_TXN_GLOBAL *txn_global;
WT_TXN_STATE *s;
- uint64_t current_id, id, last_running, oldest_id, prev_oldest_id;
+ uint64_t id, last_running, oldest_id, prev_oldest_id;
uint32_t i, session_cnt;
- int32_t count;
- bool last_running_moved;
conn = S2C(session);
txn_global = &conn->txn_global;
-
-retry:
- current_id = last_running = txn_global->current;
oldest_session = NULL;
- prev_oldest_id = txn_global->oldest_id;
- /*
- * For pure read-only workloads, or if the update isn't forced and the
- * oldest ID isn't too far behind, avoid scanning.
- */
- if (prev_oldest_id == current_id ||
- (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
- return;
-
- /*
- * We're going to scan. Increment the count of scanners to prevent the
- * oldest ID from moving forwards. Spin if the count is negative,
- * which indicates that some thread is moving the oldest ID forwards.
- */
- do {
- if ((count = txn_global->scan_count) < 0)
- WT_PAUSE();
- } while (count < 0 ||
- !__wt_atomic_casiv32(&txn_global->scan_count, count, count + 1));
-
- /* The oldest ID cannot change until the scan count goes to zero. */
+ /* The oldest ID cannot change while we are holding the scan lock. */
prev_oldest_id = txn_global->oldest_id;
- current_id = oldest_id = last_running = txn_global->current;
+ oldest_id = last_running = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -264,7 +228,7 @@ retry:
* !!!
* Note: Don't ignore snap_min values older than the previous
* oldest ID. Read-uncommitted operations publish snap_min
- * values without incrementing scan_count to protect the global
+ * values without acquiring the scan lock to protect the global
* table. See the comment in __wt_txn_cursor_op for
* more details.
*/
@@ -283,76 +247,118 @@ retry:
WT_TXNID_LT(id, oldest_id))
oldest_id = id;
- /* Update the last running ID. */
- last_running_moved =
- WT_TXNID_LT(txn_global->last_running, last_running);
+ *oldest_idp = oldest_id;
+ *oldest_sessionp = oldest_session;
+ *last_runningp = last_running;
+}
- /* Update the oldest ID. */
- if (WT_TXNID_LT(prev_oldest_id, oldest_id) || last_running_moved) {
- /*
- * We know we want to update. Check if we're racing.
- */
- if (__wt_atomic_casiv32(&txn_global->scan_count, 1, -1)) {
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = txn_global->states;
- i < session_cnt; i++, s++) {
- if ((id = s->id) != WT_TXN_NONE &&
- WT_TXNID_LT(id, last_running))
- last_running = id;
- if ((id = s->snap_min) != WT_TXN_NONE &&
- WT_TXNID_LT(id, oldest_id))
- oldest_id = id;
- }
-
- if (WT_TXNID_LT(last_running, oldest_id))
- oldest_id = last_running;
+/*
+ * __wt_txn_update_oldest --
+ * Sweep the running transactions to update the oldest ID required.
+ */
+int
+__wt_txn_update_oldest(WT_SESSION_IMPL *session, bool force)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *oldest_session;
+ WT_TXN_GLOBAL *txn_global;
+ uint64_t current_id, last_running, oldest_id;
+ uint64_t prev_last_running, prev_oldest_id;
+
+ conn = S2C(session);
+ txn_global = &conn->txn_global;
+
+ current_id = last_running = txn_global->current;
+ prev_last_running = txn_global->last_running;
+ prev_oldest_id = txn_global->oldest_id;
+
+ /*
+ * For pure read-only workloads, or if the update isn't forced and the
+ * oldest ID isn't too far behind, avoid scanning.
+ */
+ if (prev_oldest_id == current_id ||
+ (!force && WT_TXNID_LT(current_id, prev_oldest_id + 100)))
+ return (0);
+
+ /* First do a read-only scan. */
+ if (force)
+ WT_RET(__wt_readlock(session, txn_global->scan_rwlock));
+ else if ((ret =
+ __wt_try_readlock(session, txn_global->scan_rwlock)) != 0)
+ return (ret == EBUSY ? 0 : ret);
+ __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
+ WT_RET(__wt_readunlock(session, txn_global->scan_rwlock));
+
+ /*
+ * If the state hasn't changed (or hasn't moved far enough for
+ * non-forced updates), give up.
+ */
+ if ((oldest_id == prev_oldest_id ||
+ (!force && WT_TXNID_LT(oldest_id, prev_oldest_id + 100))) &&
+ ((last_running == prev_last_running) ||
+ (!force && WT_TXNID_LT(last_running, prev_last_running + 100))))
+ return (0);
+
+ /* It looks like an update is necessary, wait for exclusive access. */
+ if (force)
+ WT_RET(__wt_writelock(session, txn_global->scan_rwlock));
+ else if ((ret =
+ __wt_try_writelock(session, txn_global->scan_rwlock)) != 0)
+ return (ret == EBUSY ? 0 : ret);
+
+ /*
+ * If the oldest ID has been updated while we waited, don't bother
+ * scanning.
+ */
+ if (WT_TXNID_LE(oldest_id, txn_global->oldest_id) &&
+ WT_TXNID_LE(last_running, txn_global->last_running))
+ goto done;
+
+ /*
+ * Re-scan now that we have exclusive access. This is necessary because
+ * threads get transaction snapshots with read locks, and we have to be
+ * sure that there isn't a thread that has got a snapshot locally but
+ * not yet published its snap_min.
+ */
+ __txn_oldest_scan(session, &oldest_id, &last_running, &oldest_session);
#ifdef HAVE_DIAGNOSTIC
- /*
- * Make sure the ID doesn't move past any named
- * snapshots.
- *
- * Don't include the read/assignment in the assert
- * statement. Coverity complains if there are
- * assignments only done in diagnostic builds, and
- * when the read is from a volatile.
- */
- id = txn_global->nsnap_oldest_id;
- WT_ASSERT(session,
- id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ {
+ /*
+ * Make sure the ID doesn't move past any named snapshots.
+ *
+ * Don't include the read/assignment in the assert statement. Coverity
+ * complains if there are assignments only done in diagnostic builds,
+ * and when the read is from a volatile.
+ */
+ uint64_t id = txn_global->nsnap_oldest_id;
+ WT_ASSERT(session,
+ id == WT_TXN_NONE || !WT_TXNID_LT(id, oldest_id));
+ }
#endif
- if (WT_TXNID_LT(txn_global->last_running, last_running))
- txn_global->last_running = last_running;
- if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
- txn_global->oldest_id = oldest_id;
- WT_ASSERT(session, txn_global->scan_count == -1);
- txn_global->scan_count = 0;
- } else {
- /*
- * We wanted to update the oldest ID but we're racing
- * another thread. Retry if this is a forced update.
- */
- WT_ASSERT(session, txn_global->scan_count > 0);
- (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
- if (force) {
- __wt_yield();
- goto retry;
- }
- }
- } else {
+ /* Update the oldest ID. */
+ if (WT_TXNID_LT(txn_global->oldest_id, oldest_id))
+ txn_global->oldest_id = oldest_id;
+ if (WT_TXNID_LT(txn_global->last_running, last_running)) {
+ txn_global->last_running = last_running;
+
+ /* Output a verbose message about long-running transactions,
+ * but only when some progress is being made. */
if (WT_VERBOSE_ISSET(session, WT_VERB_TRANSACTION) &&
current_id - oldest_id > 10000 && oldest_session != NULL) {
- (void)__wt_verbose(session, WT_VERB_TRANSACTION,
+ WT_TRET(__wt_verbose(session, WT_VERB_TRANSACTION,
"old snapshot %" PRIu64
" pinned in session %" PRIu32 " [%s]"
" with snap_min %" PRIu64 "\n",
oldest_id, oldest_session->id,
oldest_session->lastop,
- oldest_session->txn.snap_min);
+ oldest_session->txn.snap_min));
}
- WT_ASSERT(session, txn_global->scan_count > 0);
- (void)__wt_atomic_subiv32(&txn_global->scan_count, 1);
}
+
+done: WT_TRET(__wt_writeunlock(session, txn_global->scan_rwlock));
+ return (ret);
}
/*
@@ -513,7 +519,7 @@ __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[])
*/
if (F_ISSET(txn, WT_TXN_SYNC_SET))
WT_RET_MSG(session, EINVAL,
- "Sync already set during begin_transaction.");
+ "Sync already set during begin_transaction");
if (WT_STRING_MATCH("background", cval.str, cval.len))
txn->txn_logsync = WT_LOG_BACKGROUND;
else if (WT_STRING_MATCH("off", cval.str, cval.len))
@@ -736,6 +742,8 @@ __wt_txn_global_init(WT_SESSION_IMPL *session, const char *cfg[])
WT_RET(__wt_spin_init(session,
&txn_global->id_lock, "transaction id lock"));
WT_RET(__wt_rwlock_alloc(session,
+ &txn_global->scan_rwlock, "transaction scan lock"));
+ WT_RET(__wt_rwlock_alloc(session,
&txn_global->nsnap_rwlock, "named snapshot lock"));
txn_global->nsnap_oldest_id = WT_TXN_NONE;
TAILQ_INIT(&txn_global->nsnaph);
@@ -768,6 +776,7 @@ __wt_txn_global_destroy(WT_SESSION_IMPL *session)
return (0);
__wt_spin_destroy(session, &txn_global->id_lock);
+ WT_TRET(__wt_rwlock_destroy(session, &txn_global->scan_rwlock));
WT_TRET(__wt_rwlock_destroy(session, &txn_global->nsnap_rwlock));
__wt_free(session, txn_global->states);
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index 27e18b254b8..c1b435d9897 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -404,7 +404,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
* This is particularly important for compact, so that all dirty pages
* can be fully written.
*/
- __wt_txn_update_oldest(session, true);
+ WT_ERR(__wt_txn_update_oldest(session, true));
/* Flush data-sources before we start the checkpoint. */
WT_ERR(__checkpoint_data_source(session, cfg));
@@ -792,6 +792,9 @@ __checkpoint_lock_tree(WT_SESSION_IMPL *session,
hot_backup_locked = false;
name_alloc = NULL;
+ /* Only referenced in diagnostic builds. */
+ WT_UNUSED(is_checkpoint);
+
/*
* Only referenced in diagnostic builds and gcc 5.1 isn't satisfied
* with wrapping the entire assert condition in the unused macro.
@@ -1281,7 +1284,7 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final)
* for active readers.
*/
if (!btree->modified && !bulk) {
- __wt_txn_update_oldest(session, true);
+ WT_RET(__wt_txn_update_oldest(session, true));
return (__wt_txn_visible_all(session, btree->rec_max_txn) ?
__wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY);
}
diff --git a/src/txn/txn_nsnap.c b/src/txn/txn_nsnap.c
index eddcca9248f..5b8fed23a9f 100644
--- a/src/txn/txn_nsnap.c
+++ b/src/txn/txn_nsnap.c
@@ -343,7 +343,7 @@ __wt_txn_named_snapshot_config(WT_SESSION_IMPL *session,
if (!*has_create && !*has_drops)
WT_RET_MSG(session, EINVAL,
"WT_SESSION::snapshot API called without any drop or "
- "name option.");
+ "name option");
return (0);
}
diff --git a/src/utilities/util_backup.c b/src/utilities/util_backup.c
index 190c0878f38..5dc9671fb45 100644
--- a/src/utilities/util_backup.c
+++ b/src/utilities/util_backup.c
@@ -23,7 +23,7 @@ append_target(WT_SESSION *session, const char *target, char **bufp)
static char *buf = NULL;
/* 20 bytes of slop */
- if (remain < strlen(target) + 20) {
+ if (buf == NULL || remain < strlen(target) + 20) {
len += strlen(target) + 512;
remain += strlen(target) + 512;
if ((buf = realloc(buf, len)) == NULL)
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index aedd9168fbd..0f09009cd4c 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -8,24 +8,19 @@
#include "util.h"
-static int dump_config(WT_SESSION *, const char *, bool);
+static int dump_config(WT_SESSION *, const char *, bool, bool);
static int dump_json_begin(WT_SESSION *);
static int dump_json_end(WT_SESSION *);
static int dump_json_separator(WT_SESSION *);
-static int dump_json_table_begin(
- WT_SESSION *, WT_CURSOR *, const char *, const char *);
-static int dump_json_table_cg(
- WT_SESSION *, WT_CURSOR *, const char *, const char *, const char *);
-static int dump_json_table_config(WT_SESSION *, const char *);
static int dump_json_table_end(WT_SESSION *);
-static int dump_prefix(WT_SESSION *, bool);
+static int dump_prefix(WT_SESSION *, bool, bool);
static int dump_record(WT_CURSOR *, bool, bool);
-static int dump_suffix(WT_SESSION *);
-static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *);
+static int dump_suffix(WT_SESSION *, bool);
+static int dump_table_config(WT_SESSION *, WT_CURSOR *, const char *, bool);
static int dump_table_config_complex(
- WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, const char *);
+ WT_SESSION *, WT_CURSOR *, WT_CURSOR *, const char *, const char *, bool);
static int dup_json_string(const char *, char **);
-static int print_config(WT_SESSION *, const char *, char *[]);
+static int print_config(WT_SESSION *, const char *, char *[], bool, bool);
static int usage(void);
int
@@ -91,9 +86,7 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
if ((name = util_name(session, argv[i], "table")) == NULL)
goto err;
- if (json && dump_json_table_config(session, name) != 0)
- goto err;
- if (!json && dump_config(session, name, hex) != 0)
+ if (dump_config(session, name, hex, json) != 0)
goto err;
len =
@@ -142,7 +135,7 @@ err: ret = 1;
* Dump the config for the uri.
*/
static int
-dump_config(WT_SESSION *session, const char *uri, bool hex)
+dump_config(WT_SESSION *session, const char *uri, bool hex, bool json)
{
WT_CURSOR *cursor;
WT_DECL_RET;
@@ -162,9 +155,9 @@ dump_config(WT_SESSION *session, const char *uri, bool hex)
*/
cursor->set_key(cursor, uri);
if ((ret = cursor->search(cursor)) == 0) {
- if (dump_prefix(session, hex) != 0 ||
- dump_table_config(session, cursor, uri) != 0 ||
- dump_suffix(session) != 0)
+ if (dump_prefix(session, hex, json) != 0 ||
+ dump_table_config(session, cursor, uri, json) != 0 ||
+ dump_suffix(session, json) != 0)
ret = 1;
} else if (ret == WT_NOTFOUND)
ret = util_err(session, 0, "%s: No such object exists", uri);
@@ -217,225 +210,6 @@ dump_json_separator(WT_SESSION *session)
}
/*
- * dump_json_table_begin --
- * Output the JSON syntax that starts a table, along with its config.
- */
-static int
-dump_json_table_begin(
- WT_SESSION *session, WT_CURSOR *cursor, const char *uri, const char *config)
-{
- WT_DECL_RET;
- const char *name;
- char *jsonconfig;
-
- jsonconfig = NULL;
-
- /* Get the table name. */
- if ((name = strchr(uri, ':')) == NULL) {
- fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
- return (1);
- }
- ++name;
-
- if ((ret = dup_json_string(config, &jsonconfig)) != 0)
- return (util_cerr(cursor, "config dup", ret));
- if (printf(" \"%s\" : [\n {\n", uri) < 0)
- goto eio;
- if (printf(" \"config\" : \"%s\",\n", jsonconfig) < 0)
- goto eio;
-
- if ((ret = dump_json_table_cg(
- session, cursor, name, "colgroup:", "colgroups")) == 0) {
- if (printf(",\n") < 0)
- goto eio;
- ret = dump_json_table_cg(
- session, cursor, name, "index:", "indices");
- }
-
- if (printf("\n },\n {\n \"data\" : [") < 0)
- goto eio;
-
- if (0) {
-eio: ret = util_err(session, EIO, NULL);
- }
-
- free(jsonconfig);
- return (ret);
-}
-
-/*
- * dump_json_table_cg --
- * Dump the column groups or indices for a table.
- */
-static int
-dump_json_table_cg(WT_SESSION *session, WT_CURSOR *cursor,
- const char *name, const char *entry, const char *header)
-{
- static const char * const indent = " ";
- WT_DECL_RET;
- int exact;
- bool once;
- const char *key, *skip, *value;
- char *jsonconfig;
-
- once = false;
- if (printf(" \"%s\" : [", header) < 0)
- return (util_err(session, EIO, NULL));
-
- /*
- * For table dumps, we're done.
- */
- if (cursor == NULL) {
- if (printf("]") < 0)
- return (util_err(session, EIO, NULL));
- else
- return (0);
- }
-
- /*
- * Search the file looking for column group and index key/value pairs:
- * for each one, look up the related source information and append it
- * to the base record.
- */
- cursor->set_key(cursor, entry);
- if ((ret = cursor->search_near(cursor, &exact)) != 0) {
- if (ret == WT_NOTFOUND)
- return (0);
- return (util_cerr(cursor, "search_near", ret));
- }
- if (exact >= 0)
- goto match;
- while ((ret = cursor->next(cursor)) == 0) {
-match: if ((ret = cursor->get_key(cursor, &key)) != 0)
- return (util_cerr(cursor, "get_key", ret));
-
- /* Check if we've finished the list of entries. */
- if (!WT_PREFIX_MATCH(key, entry))
- break;
-
- /* Check for a table name match. */
- skip = key + strlen(entry);
- if (strncmp(
- skip, name, strlen(name)) != 0 || skip[strlen(name)] != ':')
- continue;
-
- /* Get the value. */
- if ((ret = cursor->get_value(cursor, &value)) != 0)
- return (util_cerr(cursor, "get_value", ret));
-
- if ((ret = dup_json_string(value, &jsonconfig)) != 0)
- return (util_cerr(cursor, "config dup", ret));
- ret = printf("%s\n"
- "%s{\n"
- "%s \"uri\" : \"%s\",\n"
- "%s \"config\" : \"%s\"\n"
- "%s}",
- once ? "," : "",
- indent, indent, key, indent, jsonconfig, indent);
- free(jsonconfig);
- if (ret < 0)
- return (util_err(session, EIO, NULL));
-
- once = true;
- }
- if (printf("%s]", once ? "\n " : "") < 0)
- return (util_err(session, EIO, NULL));
- if (ret == 0 || ret == WT_NOTFOUND)
- return (0);
- return (util_cerr(cursor, "next", ret));
-}
-
-/*
- * dump_json_table_config --
- * Dump the config for the uri.
- */
-static int
-dump_json_table_config(WT_SESSION *session, const char *uri)
-{
- WT_CONFIG_ITEM cval;
- WT_CURSOR *cursor;
- WT_DECL_RET;
- size_t len;
- int tret;
- const char *name, *value;
- char *p;
-
- p = NULL;
-
- /* Get the table name. */
- if ((name = strchr(uri, ':')) == NULL) {
- fprintf(stderr, "%s: %s: corrupted uri\n", progname, uri);
- return (1);
- }
- ++name;
-
- /* Open a metadata cursor. */
- if ((ret = session->open_cursor(
- session, "metadata:create", NULL, NULL, &cursor)) != 0) {
- fprintf(stderr, "%s: %s: session.open_cursor: %s\n",
- progname, "metadata:create",
- session->strerror(session, ret));
- return (1);
- }
-
- /*
- * Search for the object itself, just to make sure it exists, we don't
- * want to output a header if the user entered the wrong name. This is
- * where we find out a table doesn't exist, use a simple error message.
- *
- * Workaround for WiredTiger "simple" table handling. Simple tables
- * have column-group entries, but they aren't listed in the metadata's
- * table entry. Figure out if it's a simple table and in that case,
- * retrieve the column-group entry and use the value from its "source"
- * file.
- */
- if (WT_PREFIX_MATCH(uri, "table:")) {
- len = strlen("colgroup:") + strlen(name) + 1;
- if ((p = malloc(len)) == NULL)
- return (util_err(session, errno, NULL));
- (void)snprintf(p, len, "colgroup:%s", name);
- cursor->set_key(cursor, p);
- if ((ret = cursor->search(cursor)) == 0) {
- if ((ret = cursor->get_value(cursor, &value)) != 0)
- return (util_cerr(cursor, "get_value", ret));
- if ((ret = __wt_config_getones(
- (WT_SESSION_IMPL *)session,
- value, "source", &cval)) != 0)
- return (util_err(
- session, ret, "%s: source entry", p));
- free(p);
- len = cval.len + 10;
- if ((p = malloc(len)) == NULL)
- return (util_err(session, errno, NULL));
- (void)snprintf(p, len, "%.*s", (int)cval.len, cval.str);
- cursor->set_key(cursor, p);
- } else
- cursor->set_key(cursor, uri);
- } else
- cursor->set_key(cursor, uri);
-
- if ((ret = cursor->search(cursor)) == 0) {
- if ((ret = cursor->get_value(cursor, &value)) != 0)
- ret = util_cerr(cursor, "get_value", ret);
- else if (dump_json_table_begin(
- session, cursor, uri, value) != 0)
- ret = 1;
- } else if (ret == WT_NOTFOUND)
- ret = util_err(session, 0, "%s: No such object exists", uri);
- else
- ret = util_err(session, ret, "%s", uri);
-
- if ((tret = cursor->close(cursor)) != 0) {
- tret = util_cerr(cursor, "close", tret);
- if (ret == 0)
- ret = tret;
- }
-
- free(p);
- return (ret);
-}
-
-/*
* dump_json_table_end --
* Output the JSON syntax that ends a table.
*/
@@ -452,7 +226,8 @@ dump_json_table_end(WT_SESSION *session)
* Dump the config for a table.
*/
static int
-dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
+dump_table_config(
+ WT_SESSION *session, WT_CURSOR *cursor, const char *uri, bool json)
{
WT_CONFIG_ITEM cval;
WT_CURSOR *srch;
@@ -479,11 +254,11 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
*/
cursor->set_key(cursor, uri);
if ((ret = cursor->search(cursor)) != 0)
- return (util_cerr(cursor, "search", ret));
+ WT_ERR(util_cerr(cursor, "search", ret));
if ((ret = cursor->get_value(cursor, &v)) != 0)
- return (util_cerr(cursor, "get_value", ret));
+ WT_ERR(util_cerr(cursor, "get_value", ret));
if ((*--cfg = strdup(v)) == NULL)
- return (util_err(session, errno, NULL));
+ WT_ERR(util_err(session, errno, NULL));
/*
* Workaround for WiredTiger "simple" table handling. Simple tables
@@ -497,37 +272,36 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
if (WT_PREFIX_MATCH(uri, "table:")) {
len = strlen("colgroup:") + strlen(name) + 1;
if ((p = malloc(len)) == NULL)
- return (util_err(session, errno, NULL));
+ WT_ERR(util_err(session, errno, NULL));
(void)snprintf(p, len, "colgroup:%s", name);
cursor->set_key(cursor, p);
if ((ret = cursor->search(cursor)) == 0) {
if ((ret = cursor->get_value(cursor, &v)) != 0)
- return (util_cerr(cursor, "get_value", ret));
+ WT_ERR(util_cerr(cursor, "get_value", ret));
if ((*--cfg = strdup(v)) == NULL)
- return (util_err(session, errno, NULL));
+ WT_ERR(util_err(session, errno, NULL));
if ((ret =__wt_config_getones(
(WT_SESSION_IMPL *)session,
*cfg, "source", &cval)) != 0)
- return (util_err(
+ WT_ERR(util_err(
session, ret, "%s: source entry", p));
free(p);
len = cval.len + 10;
if ((p = malloc(len)) == NULL)
- return (util_err(session, errno, NULL));
+ WT_ERR(util_err(session, errno, NULL));
(void)snprintf(p, len, "%.*s", (int)cval.len, cval.str);
cursor->set_key(cursor, p);
if ((ret = cursor->search(cursor)) != 0)
- return (util_cerr(cursor, "search", ret));
+ WT_ERR(util_cerr(cursor, "search", ret));
if ((ret = cursor->get_value(cursor, &v)) != 0)
- return (util_cerr(cursor, "get_value", ret));
+ WT_ERR(util_cerr(cursor, "get_value", ret));
if ((*--cfg = strdup(v)) == NULL)
- return (util_err(session, errno, NULL));
+ WT_ERR(util_err(session, errno, NULL));
} else
complex_table = true;
}
- if (print_config(session, uri, cfg) != 0)
- return (1);
+ WT_ERR(print_config(session, uri, cfg, json, true));
if (complex_table) {
/*
@@ -537,21 +311,24 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
*/
if ((ret = session->open_cursor(
session, "metadata:", NULL, NULL, &srch)) != 0)
- return (util_cerr(cursor, "open_cursor", ret));
+ WT_ERR(util_cerr(cursor, "open_cursor", ret));
if ((ret = dump_table_config_complex(
- session, cursor, srch, name, "colgroup:")) == 0)
+ session, cursor, srch, name, "colgroup:", json)) == 0)
ret = dump_table_config_complex(
- session, cursor, srch, name, "index:");
+ session, cursor, srch, name, "index:", json);
if ((tret = srch->close(srch)) != 0) {
tret = util_cerr(cursor, "close", tret);
if (ret == 0)
ret = tret;
}
- }
+ } else if (json && printf(
+ " \"colgroups\" : [],\n"
+ " \"indices\" : []\n") < 0)
+ WT_ERR(util_cerr(cursor, NULL, EIO));
- free(p);
+err: free(p);
free(_cfg[0]);
free(_cfg[1]);
free(_cfg[2]);
@@ -563,17 +340,31 @@ dump_table_config(WT_SESSION *session, WT_CURSOR *cursor, const char *uri)
* Dump the column groups or indices for a table.
*/
static int
-dump_table_config_complex(WT_SESSION *session,
- WT_CURSOR *cursor, WT_CURSOR *srch, const char *name, const char *entry)
+dump_table_config_complex(WT_SESSION *session, WT_CURSOR *cursor,
+ WT_CURSOR *srch, const char *name, const char *entry, bool json)
{
WT_CONFIG_ITEM cval;
WT_DECL_RET;
- const char *key;
+ bool multiple;
+ const char *groupname, *key, *sep;
size_t len;
int exact;
const char *v;
char *p, *cfg[3] = {NULL, NULL, NULL};
+ multiple = false;
+ sep = "";
+
+ if (json) {
+ if (strcmp(entry, "colgroup:") == 0) {
+ groupname = "colgroups";
+ sep = ",";
+ } else {
+ groupname = "indices";
+ }
+ if (printf(" \"%s\" : [", groupname) < 0)
+ return (util_err(session, EIO, NULL));
+ }
/*
* Search the file looking for column group and index key/value pairs:
* for each one, look up the related source information and append it
@@ -594,7 +385,7 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0)
/* Check if we've finished the list of entries. */
if (!WT_PREFIX_MATCH(key, entry))
- return (0);
+ break;
/*
* Check for a table name match. This test will match "simple"
@@ -635,14 +426,19 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0)
if ((cfg[0] = strdup(v)) == NULL)
return (util_err(session, errno, NULL));
+ if (json && printf("%s\n", multiple ? "," : "") < 0)
+ return (util_err(session, EIO, NULL));
/*
* The dumped configuration string is the original key plus the
* source's configuration, where the values of the original key
* override any source configurations of the same name.
*/
- if (print_config(session, key, cfg) != 0)
+ if (print_config(session, key, cfg, json, false) != 0)
return (util_err(session, EIO, NULL));
+ multiple = true;
}
+ if (json && printf("\n ]%s\n", sep) < 0)
+ return (util_err(session, EIO, NULL));
free(cfg[0]);
free(cfg[1]);
@@ -656,10 +452,13 @@ match: if ((ret = cursor->get_key(cursor, &key)) != 0)
* Output the dump file header prefix.
*/
static int
-dump_prefix(WT_SESSION *session, bool hex)
+dump_prefix(WT_SESSION *session, bool hex, bool json)
{
int vmajor, vminor, vpatch;
+ if (json)
+ return (0);
+
(void)wiredtiger_version(&vmajor, &vminor, &vpatch);
if (printf(
@@ -718,10 +517,18 @@ dump_record(WT_CURSOR *cursor, bool reverse, bool json)
* Output the dump file header suffix.
*/
static int
-dump_suffix(WT_SESSION *session)
+dump_suffix(WT_SESSION *session, bool json)
{
- if (printf("Data\n") < 0)
- return (util_err(session, EIO, NULL));
+ if (json) {
+ if (printf(
+ " },\n"
+ " {\n"
+ " \"data\" : [") < 0)
+ return (util_err(session, EIO, NULL));
+ } else {
+ if (printf("Data\n") < 0)
+ return (util_err(session, EIO, NULL));
+ }
return (0);
}
@@ -759,21 +566,40 @@ dup_json_string(const char *str, char **result)
* Output a key/value URI pair by combining v1 and v2.
*/
static int
-print_config(WT_SESSION *session, const char *key, char *cfg[])
+print_config(
+ WT_SESSION *session, const char *key, char *cfg[], bool json, bool toplevel)
{
WT_DECL_RET;
- char *value_ret;
+ char *jsonconfig, *value_ret;
/*
* We have all of the object configuration, but don't have the default
* session.create configuration. Have the underlying library add in the
* defaults and collapse it all into one load configuration string.
*/
+ jsonconfig = NULL;
if ((ret = __wt_schema_create_final(
(WT_SESSION_IMPL *)session, cfg, &value_ret)) != 0)
return (util_err(session, ret, NULL));
- ret = printf("%s\n%s\n", key, value_ret);
+ if (json && (ret = dup_json_string(value_ret, &jsonconfig)) != 0) {
+ free(value_ret);
+ return (util_err(session, ret, NULL));
+ }
+ if (json) {
+ if (toplevel)
+ ret = printf(
+ " \"%s\" : [\n {\n "
+ "\"config\" : \"%s\",\n", key, jsonconfig);
+ else
+ ret = printf(
+ " {\n"
+ " \"uri\" : \"%s\",\n"
+ " \"config\" : \"%s\"\n"
+ " }", key, jsonconfig);
+ } else
+ ret = printf("%s\n%s\n", key, value_ret);
free(value_ret);
+ free(jsonconfig);
if (ret < 0)
return (util_err(session, EIO, NULL));
return (0);
diff --git a/src/utilities/util_main.c b/src/utilities/util_main.c
index e18d8d7d1f5..2054b94e3ce 100644
--- a/src/utilities/util_main.c
+++ b/src/utilities/util_main.c
@@ -36,7 +36,6 @@ main(int argc, char *argv[])
conn = NULL;
p = NULL;
- secretkey = NULL;
/* Get the program name. */
if ((progname = strrchr(argv[0], '/')) == NULL)
diff --git a/test/bloom/Makefile.am b/test/bloom/Makefile.am
index 86d87c70071..0592cec7e42 100644
--- a/test/bloom/Makefile.am
+++ b/test/bloom/Makefile.am
@@ -11,4 +11,4 @@ TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
clean-local:
- rm -rf WiredTiger* *.core __*
+ rm -rf WiredTiger* *.core
diff --git a/test/bloom/test_bloom.c b/test/bloom/test_bloom.c
index f95bc7faaf9..6955813dc68 100644
--- a/test/bloom/test_bloom.c
+++ b/test/bloom/test_bloom.c
@@ -189,9 +189,7 @@ run(void)
* ensure the value doesn't overlap with existing values.
*/
item.size = g.c_key_max + 10;
- item.data = calloc(item.size, 1);
- if (item.data == NULL)
- testutil_die(ENOMEM, "value buffer malloc");
+ item.data = dcalloc(item.size, 1);
memset((void *)item.data, 'a', item.size);
for (i = 0, fp = 0; i < g.c_ops; i++) {
((uint8_t *)item.data)[i % item.size] =
@@ -232,14 +230,10 @@ populate_entries(void)
srand(g.c_srand);
- entries = calloc(g.c_ops, sizeof(uint8_t *));
- if (entries == NULL)
- testutil_die(ENOMEM, "key buffer malloc");
+ entries = dcalloc(g.c_ops, sizeof(uint8_t *));
for (i = 0; i < g.c_ops; i++) {
- entries[i] = calloc(g.c_key_max, sizeof(uint8_t));
- if (entries[i] == NULL)
- testutil_die(ENOMEM, "key buffer malloc 2");
+ entries[i] = dcalloc(g.c_key_max, sizeof(uint8_t));
for (j = 0; j < g.c_key_max; j++)
entries[i][j] = 'a' + ((uint8_t)rand() % 26);
}
diff --git a/test/checkpoint/test_checkpoint.c b/test/checkpoint/test_checkpoint.c
index c5524b3c63e..307cfd914bd 100644
--- a/test/checkpoint/test_checkpoint.c
+++ b/test/checkpoint/test_checkpoint.c
@@ -61,8 +61,7 @@ main(int argc, char *argv[])
working_dir = NULL;
ttype = MIX;
g.checkpoint_name = "WiredTigerCheckpoint";
- if ((g.home = malloc(512)) == NULL)
- testutil_die(ENOMEM, "Unable to allocate memory");
+ g.home = dmalloc(512);
g.nkeys = 10000;
g.nops = 100000;
g.ntables = 3;
diff --git a/test/checkpoint/test_checkpoint.h b/test/checkpoint/test_checkpoint.h
index 09edaeb84bc..10e21289dd3 100644
--- a/test/checkpoint/test_checkpoint.h
+++ b/test/checkpoint/test_checkpoint.h
@@ -26,19 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/types.h>
-#include <sys/time.h>
+#include "test_util.i"
-#include <errno.h>
-#include <inttypes.h>
-#include <pthread.h>
#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "test_util.i"
#define URI_BASE "table:__wt" /* File name */
diff --git a/test/cursor_order/Makefile.am b/test/cursor_order/Makefile.am
index c0c0ed639bf..8afb8f122d8 100644
--- a/test/cursor_order/Makefile.am
+++ b/test/cursor_order/Makefile.am
@@ -10,4 +10,4 @@ cursor_order_LDFLAGS = -static
TESTS = $(noinst_PROGRAMS)
clean-local:
- rm -rf WiredTiger* wt.* *.core __stats
+ rm -rf WT_TEST *.core
diff --git a/test/cursor_order/cursor_order_ops.c b/test/cursor_order/cursor_order_ops.c
index d44505ab2f3..a2185dd123f 100644
--- a/test/cursor_order/cursor_order_ops.c
+++ b/test/cursor_order/cursor_order_ops.c
@@ -59,22 +59,16 @@ ops_start(SHARED_CONFIG *cfg)
total_nops = 0;
/* Create per-thread structures. */
- if ((run_info = calloc(
- (size_t)(cfg->reverse_scanners + cfg->append_inserters),
- sizeof(*run_info))) == NULL)
- testutil_die(errno, "calloc");
-
- if ((tids = calloc(
- (size_t)(cfg->reverse_scanners + cfg->append_inserters),
- sizeof(*tids))) == NULL)
- testutil_die(errno, "calloc");
+ run_info = dcalloc((size_t)
+ (cfg->reverse_scanners + cfg->append_inserters), sizeof(*run_info));
+ tids = dcalloc((size_t)
+ (cfg->reverse_scanners + cfg->append_inserters), sizeof(*tids));
/* Create the files and load the initial records. */
for (i = 0; i < cfg->append_inserters; ++i) {
run_info[i].cfg = cfg;
if (i == 0 || cfg->multiple_files) {
- if ((run_info[i].name = malloc(64)) == NULL)
- testutil_die(errno, "malloc");
+ run_info[i].name = dmalloc(64);
snprintf(run_info[i].name, 64, FNAME, (int)i);
/* Vary by orders of magnitude */
@@ -96,8 +90,7 @@ ops_start(SHARED_CONFIG *cfg)
offset = i + cfg->append_inserters;
run_info[offset].cfg = cfg;
if (cfg->multiple_files) {
- if ((run_info[offset].name = malloc(64)) == NULL)
- testutil_die(errno, "malloc");
+ run_info[offset].name = dmalloc(64);
/* Have reverse scans read from tables with writes. */
name_index = i % cfg->append_inserters;
snprintf(
diff --git a/test/fops/fops.c b/test/fops/fops.c
index 3333ff16858..3c4de161423 100644
--- a/test/fops/fops.c
+++ b/test/fops/fops.c
@@ -59,10 +59,8 @@ fop_start(u_int nthreads)
tids = NULL; /* Silence GCC 4.1 warning. */
/* Create statistics and thread structures. */
- if ((run_stats = calloc(
- (size_t)(nthreads), sizeof(*run_stats))) == NULL ||
- (tids = calloc((size_t)(nthreads), sizeof(*tids))) == NULL)
- testutil_die(errno, "calloc");
+ run_stats = dcalloc((size_t)(nthreads), sizeof(*run_stats));
+ tids = dcalloc((size_t)(nthreads), sizeof(*tids));
(void)gettimeofday(&start, NULL);
diff --git a/test/fops/thread.h b/test/fops/thread.h
index f9707c14590..630c2061285 100644
--- a/test/fops/thread.h
+++ b/test/fops/thread.h
@@ -26,25 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/types.h>
-#ifndef _WIN32
-#include <sys/time.h>
-#endif
+#include "test_util.i"
-#include <errno.h>
-#include <inttypes.h>
-#ifndef _WIN32
-#include <pthread.h>
-#endif
#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include "test_util.i"
extern WT_CONNECTION *conn; /* WiredTiger connection */
diff --git a/test/format/backup.c b/test/format/backup.c
index 2b1463bd0e3..69fdf771de9 100644
--- a/test/format/backup.c
+++ b/test/format/backup.c
@@ -38,7 +38,7 @@ check_copy(void)
WT_CONNECTION *conn;
WT_SESSION *session;
- wts_open(g.home_backup, 0, &conn);
+ wts_open(g.home_backup, false, &conn);
testutil_checkfmt(
conn->open_session(conn, NULL, NULL, &session),
@@ -53,27 +53,30 @@ check_copy(void)
/*
* copy_file --
- * Copy a single file into the backup directory.
+ * Copy a single file into the backup directories.
*/
static void
-copy_file(const char *name)
+copy_file(WT_SESSION *session, const char *name)
{
size_t len;
- char *cmd;
-
- len = strlen(g.home) + strlen(g.home_backup) + strlen(name) * 2 + 20;
- cmd = dmalloc(len);
- (void)snprintf(cmd, len,
- "cp %s/%s %s/%s", g.home, name, g.home_backup, name);
- testutil_checkfmt(system(cmd), "backup copy: %s", cmd);
- free(cmd);
-
- len = strlen(g.home) + strlen(g.home_backup2) + strlen(name) * 2 + 20;
- cmd = dmalloc(len);
- (void)snprintf(cmd, len,
- "cp %s/%s %s/%s", g.home, name, g.home_backup2, name);
- testutil_checkfmt(system(cmd), "backup copy: %s", cmd);
- free(cmd);
+ char *first, *second;
+
+ len = strlen("BACKUP") + strlen(name) + 10;
+ first = dmalloc(len);
+ (void)snprintf(first, len, "BACKUP/%s", name);
+ testutil_check(__wt_copy_and_sync(session, name, first));
+
+ /*
+ * Save another copy of the original file to make debugging recovery
+ * errors easier.
+ */
+ len = strlen("BACKUP_COPY") + strlen(name) + 10;
+ second = dmalloc(len);
+ (void)snprintf(second, len, "BACKUP_COPY/%s", name);
+ testutil_check(__wt_copy_and_sync(session, first, second));
+
+ free(first);
+ free(second);
}
/*
@@ -85,10 +88,11 @@ backup(void *arg)
{
WT_CONNECTION *conn;
WT_CURSOR *backup_cursor;
+ WT_DECL_RET;
WT_SESSION *session;
- u_int period;
- int ret;
- const char *key;
+ u_int incremental, period;
+ bool full;
+ const char *config, *key;
(void)(arg);
@@ -102,48 +106,86 @@ backup(void *arg)
testutil_check(conn->open_session(conn, NULL, NULL, &session));
/*
- * Perform a backup at somewhere under 10 seconds (so we get at
- * least one done), and then at 45 second intervals.
+ * Perform a full backup at somewhere under 10 seconds (that way there's
+ * at least one), then at larger intervals, optionally do incremental
+ * backups between full backups.
*/
- for (period = mmrand(NULL, 1, 10);; period = 45) {
+ incremental = 0;
+ for (period = mmrand(NULL, 1, 10);; period = mmrand(NULL, 20, 45)) {
/* Sleep for short periods so we don't make the run wait. */
while (period > 0 && !g.workers_finished) {
--period;
sleep(1);
}
- if (g.workers_finished)
- break;
- /* Lock out named checkpoints */
+ /*
+ * We can't drop named checkpoints while there's a backup in
+ * progress, serialize backups with named checkpoints. Wait
+ * for the checkpoint to complete, otherwise backups might be
+ * starved out.
+ */
testutil_check(pthread_rwlock_wrlock(&g.backup_lock));
+ if (g.workers_finished) {
+ testutil_check(pthread_rwlock_unlock(&g.backup_lock));
+ break;
+ }
- /* Re-create the backup directory. */
- testutil_checkfmt(
- system(g.home_backup_init),
- "%s", "backup directory creation failed");
+ if (incremental) {
+ config = "target=(\"log:\")";
+ full = false;
+ } else {
+ /* Re-create the backup directory. */
+ testutil_checkfmt(
+ system(g.home_backup_init),
+ "%s", "backup directory creation failed");
+
+ config = NULL;
+ full = true;
+ }
/*
- * open_cursor can return EBUSY if a metadata operation is
- * currently happening - retry in that case.
+ * open_cursor can return EBUSY if concurrent with a metadata
+ * operation, retry in that case.
*/
- while ((ret = session->open_cursor(session,
- "backup:", NULL, NULL, &backup_cursor)) == EBUSY)
- sleep(1);
+ while ((ret = session->open_cursor(
+ session, "backup:", NULL, config, &backup_cursor)) == EBUSY)
+ __wt_yield();
if (ret != 0)
testutil_die(ret, "session.open_cursor: backup");
while ((ret = backup_cursor->next(backup_cursor)) == 0) {
testutil_check(
backup_cursor->get_key(backup_cursor, &key));
- copy_file(key);
+ copy_file(session, key);
}
+ if (ret != WT_NOTFOUND)
+ testutil_die(ret, "backup-cursor");
+
+ /* After an incremental backup, truncate the log files. */
+ if (incremental)
+ testutil_check(session->truncate(
+ session, "log:", backup_cursor, NULL, NULL));
testutil_check(backup_cursor->close(backup_cursor));
testutil_check(pthread_rwlock_unlock(&g.backup_lock));
- check_copy();
+ /*
+ * If automatic log archival isn't configured, optionally do
+ * incremental backups after each full backup. If we're not
+ * doing any more incrementals, verify the backup (we can't
+ * verify intermediate states, once we perform recovery on the
+ * backup database, we can't do any more incremental backups).
+ */
+ if (full)
+ incremental =
+ g.c_logging_archive ? 1 : mmrand(NULL, 1, 5);
+ if (--incremental == 0)
+ check_copy();
}
+ if (incremental != 0)
+ check_copy();
+
testutil_check(session->close(session, NULL));
return (NULL);
diff --git a/test/format/bdb.c b/test/format/bdb.c
index 823fc8ff888..48229cfd5e7 100644
--- a/test/format/bdb.c
+++ b/test/format/bdb.c
@@ -30,7 +30,7 @@
#include "format.h"
static DBT key, value;
-static uint8_t *keybuf;
+static WT_ITEM keyitem;
static int
bdb_compare_reverse(DB *dbp, const DBT *k1, const DBT *k2
@@ -78,7 +78,7 @@ bdb_open(void)
assert(db->cursor(db, NULL, &dbc, 0) == 0);
g.dbc = dbc;
- key_gen_setup(&keybuf);
+ key_gen_setup(&keyitem);
}
void
@@ -95,8 +95,7 @@ bdb_close(void)
assert(db->close(db, 0) == 0);
assert(dbenv->close(dbenv, 0) == 0);
- free(keybuf);
- keybuf = NULL;
+ free(keyitem.mem);
}
void
@@ -144,12 +143,11 @@ void
bdb_read(uint64_t keyno, void *valuep, size_t *valuesizep, int *notfoundp)
{
DBC *dbc = g.dbc;
- size_t size;
int ret;
- key_gen(keybuf, &size, keyno);
- key.data = keybuf;
- key.size = (uint32_t)size;
+ key_gen(&keyitem, keyno);
+ key.data = (void *)keyitem.data;
+ key.size = keyitem.size;
*notfoundp = 0;
if ((ret = dbc->get(dbc, &key, &value, DB_SET)) != 0) {
@@ -165,7 +163,7 @@ bdb_read(uint64_t keyno, void *valuep, size_t *valuesizep, int *notfoundp)
void
bdb_update(const void *arg_key, size_t arg_key_size,
- const void *arg_value, size_t arg_value_size, int *notfoundp)
+ const void *arg_value, size_t arg_value_size)
{
DBC *dbc = g.dbc;
int ret;
@@ -175,15 +173,10 @@ bdb_update(const void *arg_key, size_t arg_key_size,
value.data = (void *)arg_value;
value.size = (uint32_t)arg_value_size;
- *notfoundp = 0;
- if ((ret = dbc->put(dbc, &key, &value, DB_KEYFIRST)) != 0) {
- if (ret != DB_NOTFOUND) {
- testutil_die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}",
- (int)key.size, (char *)key.data,
- (int)value.size, (char *)value.data);
- }
- *notfoundp = 1;
- }
+ if ((ret = dbc->put(dbc, &key, &value, DB_KEYFIRST)) != 0)
+ testutil_die(ret, "dbc.put: DB_KEYFIRST: {%.*s}{%.*s}",
+ (int)key.size, (char *)key.data,
+ (int)value.size, (char *)value.data);
}
void
@@ -193,9 +186,9 @@ bdb_remove(uint64_t keyno, int *notfoundp)
size_t size;
int ret;
- key_gen(keybuf, &size, keyno);
- key.data = keybuf;
- key.size = (uint32_t)size;
+ key_gen(&keyitem, keyno);
+ key.data = (void *)keyitem.data;
+ key.size = keyitem.size;
bdb_read(keyno, &value.data, &size, notfoundp);
value.size = (uint32_t)size;
diff --git a/test/format/bulk.c b/test/format/bulk.c
index 64b005d294f..dab23bed404 100644
--- a/test/format/bulk.c
+++ b/test/format/bulk.c
@@ -33,13 +33,12 @@ wts_load(void)
{
WT_CONNECTION *conn;
WT_CURSOR *cursor;
+ WT_DECL_RET;
WT_ITEM key, value;
WT_SESSION *session;
- uint8_t *keybuf, *valbuf;
bool is_bulk;
conn = g.wts_conn;
- keybuf = valbuf = NULL;
testutil_check(conn->open_session(conn, NULL, NULL, &session));
@@ -63,8 +62,8 @@ wts_load(void)
is_bulk ? "bulk,append" : NULL, &cursor));
/* Set up the key/value buffers. */
- key_gen_setup(&keybuf);
- val_gen_setup(NULL, &valbuf);
+ key_gen_setup(&key);
+ val_gen_setup(NULL, &value);
for (;;) {
if (++g.key_cnt > g.c_rows) {
@@ -73,13 +72,11 @@ wts_load(void)
}
/* Report on progress every 100 inserts. */
- if (g.key_cnt % 100 == 0)
+ if (g.key_cnt % 1000 == 0)
track("bulk load", g.key_cnt, NULL);
- key_gen(keybuf, &key.size, (uint64_t)g.key_cnt);
- key.data = keybuf;
- val_gen(NULL, valbuf, &value.size, (uint64_t)g.key_cnt);
- value.data = valbuf;
+ key_gen(&key, g.key_cnt);
+ val_gen(NULL, &value, g.key_cnt);
switch (g.type) {
case FIX:
@@ -88,7 +85,7 @@ wts_load(void)
cursor->set_value(cursor, *(uint8_t *)value.data);
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s %" PRIu32 " {0x%02" PRIx8 "}",
+ "%-10s %" PRIu64 " {0x%02" PRIx8 "}",
"bulk V",
g.key_cnt, ((uint8_t *)value.data)[0]);
break;
@@ -98,7 +95,7 @@ wts_load(void)
cursor->set_value(cursor, &value);
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s %" PRIu32 " {%.*s}", "bulk V",
+ "%-10s %" PRIu64 " {%.*s}", "bulk V",
g.key_cnt,
(int)value.size, (char *)value.data);
break;
@@ -106,18 +103,40 @@ wts_load(void)
cursor->set_key(cursor, &key);
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s %" PRIu32 " {%.*s}", "bulk K",
+ "%-10s %" PRIu64 " {%.*s}", "bulk K",
g.key_cnt, (int)key.size, (char *)key.data);
cursor->set_value(cursor, &value);
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s %" PRIu32 " {%.*s}", "bulk V",
+ "%-10s %" PRIu64 " {%.*s}", "bulk V",
g.key_cnt,
(int)value.size, (char *)value.data);
break;
}
- testutil_check(cursor->insert(cursor));
+ /*
+ * We don't want to size the cache to ensure the initial data
+ * set can load in the in-memory case, guaranteeing the load
+ * succeeds probably means future updates are also guaranteed
+ * to succeed, which isn't what we want. If we run out of space
+ * in the initial load, reset the row counter and continue.
+ *
+ * Decrease inserts, they can't be successful if we're at the
+ * cache limit, and increase the delete percentage to get some
+ * extra space once the run starts.
+ */
+ if ((ret = cursor->insert(cursor)) != 0) {
+ if (ret != WT_CACHE_FULL)
+ testutil_die(ret, "cursor.insert");
+ g.rows = --g.key_cnt;
+ g.c_rows = (uint32_t)g.key_cnt;
+
+ if (g.c_insert_pct > 5)
+ g.c_insert_pct = 5;
+ if (g.c_delete_pct < 20)
+ g.c_delete_pct += 20;
+ break;
+ }
#ifdef HAVE_BERKELEY_DB
if (SINGLETHREADED)
@@ -133,6 +152,6 @@ wts_load(void)
testutil_check(session->close(session, NULL));
- free(keybuf);
- free(valbuf);
+ free(key.mem);
+ free(value.mem);
}
diff --git a/test/format/compact.c b/test/format/compact.c
index a75ee4f2adf..240e5553697 100644
--- a/test/format/compact.c
+++ b/test/format/compact.c
@@ -36,9 +36,9 @@ void *
compact(void *arg)
{
WT_CONNECTION *conn;
+ WT_DECL_RET;
WT_SESSION *session;
u_int period;
- int ret;
(void)(arg);
diff --git a/test/format/config.c b/test/format/config.c
index 042316d8344..1b09916bd88 100644
--- a/test/format/config.c
+++ b/test/format/config.c
@@ -35,6 +35,7 @@ static void config_encryption(void);
static const char *config_file_type(u_int);
static CONFIG *config_find(const char *, size_t);
static void config_in_memory(void);
+static void config_in_memory_check(void);
static int config_is_perm(const char *);
static void config_isolation(void);
static void config_lrt(void);
@@ -43,6 +44,7 @@ static void config_map_compression(const char *, u_int *);
static void config_map_encryption(const char *, u_int *);
static void config_map_file_type(const char *, u_int *);
static void config_map_isolation(const char *, u_int *);
+static void config_reset(void);
/*
* config_setup --
@@ -54,14 +56,10 @@ config_setup(void)
CONFIG *cp;
/* Clear any temporary values. */
- config_clear();
+ config_reset();
- /*
- * Periodically, run in-memory; don't do it on the first run, all our
- * smoke tests would hit it.
- */
- if (!config_is_perm("in_memory") && g.run_cnt % 20 == 19)
- g.c_in_memory = 1;
+ /* Periodically run in-memory. */
+ config_in_memory();
/*
* Choose a data source type and a file type: they're interrelated (LSM
@@ -145,7 +143,7 @@ config_setup(void)
/* Some data-sources don't support user-specified collations. */
if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
- g.c_reverse = 0;
+ config_single("reverse=off", 0);
/*
* Periodically, run single-threaded so we can compare the results to
@@ -159,7 +157,6 @@ config_setup(void)
config_compression("compression");
config_compression("logging_compression");
config_encryption();
- config_in_memory();
config_isolation();
config_lrt();
@@ -169,7 +166,7 @@ config_setup(void)
* Don't do it on the first run, all our smoke tests would hit it.
*/
if (!g.replay && g.run_cnt % 10 == 9 && !config_is_perm("delete_pct"))
- g.c_delete_pct = 0;
+ config_single("delete_pct=0", 0);
/*
* If this is an LSM run, set the cache size and crank up the insert
@@ -187,9 +184,12 @@ config_setup(void)
if (!config_is_perm("cache") && g.c_cache < g.c_threads)
g.c_cache = g.c_threads;
+ /* Give in-memory configuration a final review. */
+ config_in_memory_check();
+
/* Make the default maximum-run length 20 minutes. */
if (!config_is_perm("timer"))
- g.c_timer = 20;
+ config_single("timer=20", 0);
/*
* Key/value minimum/maximum are related, correct unless specified by
@@ -329,43 +329,89 @@ config_encryption(void)
/*
* config_in_memory --
- * In-memory configuration.
+ * Periodically set up an in-memory configuration.
*/
static void
config_in_memory(void)
{
+ /*
+ * Configure in-memory before configuring anything else, in-memory has
+ * many related requirements. Don't configure in-memory if there's any
+ * incompatible configurations, so we don't have to configure in-memory
+ * every time we configure something like LSM, that's too painful.
+ */
+ if (config_is_perm("backups"))
+ return;
+ if (config_is_perm("checkpoints"))
+ return;
+ if (config_is_perm("compression"))
+ return;
+ if (config_is_perm("data_source") && DATASOURCE("lsm"))
+ return;
+ if (config_is_perm("logging"))
+ return;
+ if (config_is_perm("rebalance"))
+ return;
+ if (config_is_perm("salvage"))
+ return;
+ if (config_is_perm("verify"))
+ return;
+
+ if (!config_is_perm("in_memory") && mmrand(NULL, 1, 20) == 1)
+ g.c_in_memory = 1;
+}
+
+/*
+ * config_in_memory_check --
+ * In-memory configuration review.
+ */
+static void
+config_in_memory_check(void)
+{
+ uint32_t cache;
+
if (g.c_in_memory == 0)
return;
/* Turn off a lot of stuff. */
if (!config_is_perm("backups"))
- g.c_backups = 0;
+ config_single("backups=off", 0);
if (!config_is_perm("checkpoints"))
- g.c_checkpoints = 0;
- if (!config_is_perm("compression")) {
- g.c_compression = dstrdup("none");
- g.c_compression_flag = COMPRESS_NONE;
- }
+ config_single("checkpoints=off", 0);
+ if (!config_is_perm("compression"))
+ config_single("compression=none", 0);
if (!config_is_perm("logging"))
- g.c_logging = 0;
+ config_single("logging=off", 0);
if (!config_is_perm("rebalance"))
- g.c_rebalance = 0;
+ config_single("rebalance=off", 0);
if (!config_is_perm("salvage"))
- g.c_salvage = 0;
+ config_single("salvage=off", 0);
if (!config_is_perm("verify"))
- g.c_verify = 0;
+ config_single("verify=off", 0);
/*
- * Ensure there is 250MB of cache per thread; keep keys/values small,
- * overflow items aren't an issue for in-memory configurations and it
- * keeps us from overflowing the cache.
+ * Keep keys/values small, overflow items aren't an issue for in-memory
+ * configurations and it keeps us from overflowing the cache.
*/
- if (!config_is_perm("cache"))
- g.c_cache = g.c_threads * 250;
if (!config_is_perm("key_max"))
- g.c_value_max = 64;
+ config_single("key_max=32", 0);
if (!config_is_perm("value_max"))
- g.c_value_max = 128;
+ config_single("value_max=80", 0);
+
+ /*
+ * Size the cache relative to the initial data set, use 2x the base
+ * size as a minimum.
+ */
+ if (!config_is_perm("cache")) {
+ cache = g.c_value_max;
+ if (g.type == ROW)
+ cache += g.c_key_max;
+ cache *= g.c_rows;
+ cache *= 2;
+ cache /= WT_MEGABYTE;
+ if (g.c_cache < cache)
+ g.c_cache = cache;
+ }
}
/*
@@ -413,11 +459,11 @@ config_lrt(void)
* stores.
*/
if (g.type == FIX) {
- if (g.c_long_running_txn && config_is_perm("long_running_txn"))
+ if (config_is_perm("long_running_txn"))
testutil_die(EINVAL,
"long_running_txn not supported with fixed-length "
"column store");
- g.c_long_running_txn = 0;
+ config_single("long_running_txn=off", 0);
}
}
@@ -503,18 +549,36 @@ config_file(const char *name)
/*
* config_clear --
- * Clear per-run values.
+ * Clear all configuration values.
*/
void
config_clear(void)
{
CONFIG *cp;
- /* Clear configuration data. */
+ /* Clear all allocated configuration data. */
+ for (cp = c; cp->name != NULL; ++cp)
+ if (cp->vstr != NULL) {
+ free((void *)*cp->vstr);
+ *cp->vstr = NULL;
+ }
+ free(g.uri);
+ g.uri = NULL;
+}
+
+/*
+ * config_reset --
+ * Clear per-run configuration values.
+ */
+static void
+config_reset(void)
+{
+ CONFIG *cp;
+
+ /* Clear temporary allocated configuration data. */
for (cp = c; cp->name != NULL; ++cp) {
F_CLR(cp, C_TEMP);
- if (!F_ISSET(cp, C_PERM) &&
- F_ISSET(cp, C_STRING) && cp->vstr != NULL) {
+ if (!F_ISSET(cp, C_PERM) && cp->vstr != NULL) {
free((void *)*cp->vstr);
*cp->vstr = NULL;
}
@@ -531,7 +595,7 @@ void
config_single(const char *s, int perm)
{
CONFIG *cp;
- uint32_t v;
+ long v;
char *p;
const char *ep;
@@ -557,43 +621,59 @@ config_single(const char *s, int perm)
exit(EXIT_FAILURE);
}
+ /*
+ * Free the previous setting if a configuration has been
+ * passed in twice.
+ */
+ if (*cp->vstr != NULL) {
+ free(*cp->vstr);
+ *cp->vstr = NULL;
+ }
+
if (strncmp(s, "checksum", strlen("checksum")) == 0) {
config_map_checksum(ep, &g.c_checksum_flag);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
} else if (strncmp(
s, "compression", strlen("compression")) == 0) {
config_map_compression(ep, &g.c_compression_flag);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
} else if (strncmp(
s, "encryption", strlen("encryption")) == 0) {
config_map_encryption(ep, &g.c_encryption_flag);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
} else if (strncmp(s, "isolation", strlen("isolation")) == 0) {
config_map_isolation(ep, &g.c_isolation_flag);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
} else if (strncmp(s, "file_type", strlen("file_type")) == 0) {
config_map_file_type(ep, &g.type);
- *cp->vstr = strdup(config_file_type(g.type));
+ *cp->vstr = dstrdup(config_file_type(g.type));
} else if (strncmp(s, "logging_compression",
strlen("logging_compression")) == 0) {
config_map_compression(ep,
&g.c_logging_compression_flag);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
} else {
free((void *)*cp->vstr);
- *cp->vstr = strdup(ep);
+ *cp->vstr = dstrdup(ep);
}
- if (*cp->vstr == NULL)
- testutil_die(errno, "malloc");
return;
}
- v = (uint32_t)strtoul(ep, &p, 10);
- if (*p != '\0') {
- fprintf(stderr, "%s: %s: illegal numeric value\n",
- g.progname, s);
- exit(EXIT_FAILURE);
+ v = -1;
+ if (F_ISSET(cp, C_BOOL)) {
+ if (strncmp(ep, "off", strlen("off")) == 0)
+ v = 0;
+ else if (strncmp(ep, "on", strlen("on")) == 0)
+ v = 1;
+ }
+ if (v == -1) {
+ v = strtol(ep, &p, 10);
+ if (*p != '\0') {
+ fprintf(stderr, "%s: %s: illegal numeric value\n",
+ g.progname, s);
+ exit(EXIT_FAILURE);
+ }
}
if (F_ISSET(cp, C_BOOL)) {
if (v != 0 && v != 1) {
@@ -607,7 +687,7 @@ config_single(const char *s, int perm)
g.progname, s, cp->min, cp->maxset);
exit(EXIT_FAILURE);
}
- *cp->v = v;
+ *cp->v = (uint32_t)v;
}
/*
diff --git a/test/format/config.h b/test/format/config.h
index a17614bc044..16fffb6fafe 100644
--- a/test/format/config.h
+++ b/test/format/config.h
@@ -294,6 +294,10 @@ static CONFIG c[] = {
"maximum time to run in minutes (default 20 minutes)",
C_IGNORE, 0, UINT_MAX, UINT_MAX, &g.c_timer, NULL },
+ { "transaction-frequency",
+ "percent operations done inside an explicit transaction",
+ 0x0, 1, 100, 100, &g.c_txn_freq, NULL },
+
{ "value_max",
"maximum size of values",
0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL },
diff --git a/test/format/format.h b/test/format/format.h
index a129c5395fd..beaabe7e83c 100644
--- a/test/format/format.h
+++ b/test/format/format.h
@@ -26,33 +26,10 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/stat.h>
-#ifndef _WIN32
-#include <sys/time.h>
-#endif
-#include <sys/types.h>
-
-#include <assert.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <limits.h>
-#ifndef _WIN32
-#include <pthread.h>
-#endif
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <time.h>
-
#include "test_util.i"
#ifdef BDB
+#include <assert.h>
#include <db.h>
#endif
@@ -109,7 +86,6 @@ typedef struct {
char *home; /* Home directory */
char *home_backup; /* Hot-backup directory */
- char *home_backup2; /* Saved Hot-backup directory */
char *home_backup_init; /* Initialize backup command */
char *home_bdb; /* BDB directory */
char *home_config; /* Run CONFIG file path */
@@ -145,7 +121,8 @@ typedef struct {
int replay; /* Replaying a run. */
int workers_finished; /* Operations completed */
- pthread_rwlock_t backup_lock; /* Hot backup running */
+ pthread_rwlock_t backup_lock; /* Backup running */
+ pthread_rwlock_t checkpoint_lock; /* Checkpoint running */
WT_RAND_STATE rnd; /* Global RNG state */
@@ -224,6 +201,7 @@ typedef struct {
uint32_t c_statistics_server;
uint32_t c_threads;
uint32_t c_timer;
+ uint32_t c_txn_freq;
uint32_t c_value_max;
uint32_t c_value_min;
uint32_t c_verify;
@@ -297,7 +275,7 @@ void bdb_np(int, void *, size_t *, void *, size_t *, int *);
void bdb_open(void);
void bdb_read(uint64_t, void *, size_t *, int *);
void bdb_remove(uint64_t, int *);
-void bdb_update(const void *, size_t, const void *, size_t, int *);
+void bdb_update(const void *, size_t, const void *, size_t);
#endif
void *backup(void *);
@@ -308,25 +286,23 @@ void config_file(const char *);
void config_print(int);
void config_setup(void);
void config_single(const char *, int);
-void *dmalloc(size_t);
-char *dstrdup(const char *);
void fclose_and_clear(FILE **);
-void key_gen(uint8_t *, size_t *, uint64_t);
-void key_gen_insert(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
-void key_gen_setup(uint8_t **);
+void key_gen(WT_ITEM *, uint64_t);
+void key_gen_insert(WT_RAND_STATE *, WT_ITEM *, uint64_t);
+void key_gen_setup(WT_ITEM *);
void key_len_setup(void);
void *lrt(void *);
void path_setup(const char *);
-int read_row(WT_CURSOR *, WT_ITEM *, uint64_t, int);
+int read_row(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
uint32_t rng(WT_RAND_STATE *);
void track(const char *, uint64_t, TINFO *);
-void val_gen(WT_RAND_STATE *, uint8_t *, size_t *, uint64_t);
-void val_gen_setup(WT_RAND_STATE *, uint8_t **);
+void val_gen(WT_RAND_STATE *, WT_ITEM *, uint64_t);
+void val_gen_setup(WT_RAND_STATE *, WT_ITEM *);
void wts_close(void);
-void wts_create(void);
void wts_dump(const char *, int);
+void wts_init(void);
void wts_load(void);
-void wts_open(const char *, int, WT_CONNECTION **);
+void wts_open(const char *, bool, WT_CONNECTION **);
void wts_ops(int);
void wts_read_scan(void);
void wts_rebalance(void);
diff --git a/test/format/lrt.c b/test/format/lrt.c
index 451d2f4fa3c..937525522fa 100644
--- a/test/format/lrt.c
+++ b/test/format/lrt.c
@@ -43,17 +43,15 @@ lrt(void *arg)
uint64_t keyno, saved_keyno;
u_int period;
int pinned, ret;
- uint8_t bitfield, *keybuf;
+ uint8_t bitfield;
void *buf;
(void)(arg); /* Unused parameter */
saved_keyno = 0; /* [-Werror=maybe-uninitialized] */
- key_gen_setup(&keybuf);
- memset(&key, 0, sizeof(key));
- key.data = keybuf;
- memset(&value, 0, sizeof(value));
+ key_gen_setup(&key);
+ val_gen_setup(NULL, &value);
buf = NULL;
buf_len = buf_size = 0;
@@ -67,8 +65,8 @@ lrt(void *arg)
for (pinned = 0;;) {
if (pinned) {
/* Re-read the record at the end of the table. */
- while ((ret = read_row(cursor,
- &key, saved_keyno, 1)) == WT_ROLLBACK)
+ while ((ret = read_row(
+ cursor, &key, &value, saved_keyno)) == WT_ROLLBACK)
;
if (ret != 0)
testutil_die(ret,
@@ -112,7 +110,7 @@ lrt(void *arg)
(u_int)(g.key_cnt - g.key_cnt / 10),
(u_int)g.key_cnt);
while ((ret = read_row(cursor,
- &key, saved_keyno, 1)) == WT_ROLLBACK)
+ &key, &value, saved_keyno)) == WT_ROLLBACK)
;
} while (ret == WT_NOTFOUND);
if (ret != 0)
@@ -129,9 +127,8 @@ lrt(void *arg)
if (ret != 0)
testutil_die(ret,
"cursor.get_value: %" PRIu64, saved_keyno);
- if (buf_len < value.size &&
- (buf = realloc(buf, buf_len = value.size)) == NULL)
- testutil_die(errno, "malloc");
+ if (buf_len < value.size)
+ buf = drealloc(buf, buf_len = value.size);
memcpy(buf, value.data, buf_size = value.size);
/*
@@ -142,7 +139,7 @@ lrt(void *arg)
do {
keyno = mmrand(NULL, 1, (u_int)g.key_cnt / 5);
while ((ret = read_row(cursor,
- &key, keyno, 1)) == WT_ROLLBACK)
+ &key, &value, keyno)) == WT_ROLLBACK)
;
} while (ret == WT_NOTFOUND);
if (ret != 0)
@@ -165,7 +162,8 @@ lrt(void *arg)
testutil_check(session->close(session, NULL));
- free(keybuf);
+ free(key.mem);
+ free(value.mem);
free(buf);
return (NULL);
diff --git a/test/format/ops.c b/test/format/ops.c
index 5d66f4d5391..9275d7f3856 100644
--- a/test/format/ops.c
+++ b/test/format/ops.c
@@ -28,14 +28,14 @@
#include "format.h"
-static int col_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t *);
-static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
-static int col_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
-static int nextprev(WT_CURSOR *, int, int *);
+static int col_insert(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t *);
+static int col_remove(WT_CURSOR *, WT_ITEM *, uint64_t);
+static int col_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
+static int nextprev(WT_CURSOR *, int);
static void *ops(void *);
-static int row_insert(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
-static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t, int *);
-static int row_update(TINFO *, WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
+static int row_insert(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
+static int row_remove(WT_CURSOR *, WT_ITEM *, uint64_t);
+static int row_update(WT_CURSOR *, WT_ITEM *, WT_ITEM *, uint64_t);
static void table_append_init(void);
#ifdef HAVE_BERKELEY_DB
@@ -103,8 +103,7 @@ wts_ops(int lastrun)
}
/* Create thread structure; start the worker threads. */
- if ((tinfo = calloc((size_t)g.c_threads, sizeof(*tinfo))) == NULL)
- testutil_die(errno, "calloc");
+ tinfo = dcalloc((size_t)g.c_threads, sizeof(*tinfo));
for (i = 0; i < g.c_threads; ++i) {
tinfo[i].id = (int)i + 1;
tinfo[i].state = TINFO_RUNNING;
@@ -184,6 +183,7 @@ wts_ops(int lastrun)
(void)pthread_join(compact_tid, NULL);
if (!SINGLETHREADED && g.c_long_running_txn)
(void)pthread_join(lrt_tid, NULL);
+ g.workers_finished = 0;
if (g.logging != 0) {
(void)g.wt_api->msg_printf(g.wt_api, session,
@@ -193,57 +193,229 @@ wts_ops(int lastrun)
}
/*
- * ops_session_config --
- * Return the current session configuration.
+ * isolation_config --
+ * Return an isolation configuration.
*/
-static const char *
-ops_session_config(WT_RAND_STATE *rnd)
+static inline const char *
+isolation_config(WT_RAND_STATE *rnd, bool *iso_snapshotp)
{
u_int v;
- /*
- * The only current session configuration is the isolation level.
- */
if ((v = g.c_isolation_flag) == ISOLATION_RANDOM)
v = mmrand(rnd, 2, 4);
switch (v) {
case ISOLATION_READ_UNCOMMITTED:
+ *iso_snapshotp = false;
return ("isolation=read-uncommitted");
case ISOLATION_READ_COMMITTED:
+ *iso_snapshotp = false;
return ("isolation=read-committed");
case ISOLATION_SNAPSHOT:
default:
+ *iso_snapshotp = true;
return ("isolation=snapshot");
}
}
+typedef struct {
+ uint64_t keyno; /* Row number */
+
+ void *kdata; /* If an insert, the generated key */
+ size_t ksize;
+ size_t kmemsize;
+
+ void *vdata; /* If not a delete, the value */
+ size_t vsize;
+ size_t vmemsize;
+
+ bool deleted; /* Delete operation */
+ bool insert; /* Insert operation */
+} SNAP_OPS;
+
+/*
+ * snap_track --
+ * Add a single snapshot isolation returned value to the list.
+ */
+static void
+snap_track(SNAP_OPS *snap, uint64_t keyno, WT_ITEM *key, WT_ITEM *value)
+{
+ snap->keyno = keyno;
+ if (key == NULL)
+ snap->insert = false;
+ else {
+ snap->insert = true;
+
+ if (snap->kmemsize < key->size) {
+ snap->kdata = drealloc(snap->kdata, key->size);
+ snap->kmemsize = key->size;
+ }
+ memcpy(snap->kdata, key->data, snap->ksize = key->size);
+ }
+ if (value == NULL)
+ snap->deleted = true;
+ else {
+ snap->deleted = false;
+ if (snap->vmemsize < value->size) {
+ snap->vdata = drealloc(snap->vdata, value->size);
+ snap->vmemsize = value->size;
+ }
+ memcpy(snap->vdata, value->data, snap->vsize = value->size);
+ }
+}
+
+/*
+ * snap_check --
+ * Check snapshot isolation operations are repeatable.
+ */
+static int
+snap_check(WT_CURSOR *cursor,
+ SNAP_OPS *start, SNAP_OPS *stop, WT_ITEM *key, WT_ITEM *value)
+{
+ WT_DECL_RET;
+ SNAP_OPS *p;
+ uint8_t bitfield;
+
+ for (; start < stop; ++start) {
+ /* Check for subsequent changes to this record. */
+ for (p = start + 1; p < stop && p->keyno != start->keyno; ++p)
+ ;
+ if (p != stop)
+ continue;
+
+ /*
+ * Retrieve the key/value pair by key. Row-store inserts have a
+ * unique generated key we saved, else generate the key from the
+ * key number.
+ */
+ if (start->insert == 0) {
+ switch (g.type) {
+ case FIX:
+ case VAR:
+ cursor->set_key(cursor, start->keyno);
+ break;
+ case ROW:
+ key_gen(key, start->keyno);
+ cursor->set_key(cursor, key);
+ break;
+ }
+ } else {
+ key->data = start->kdata;
+ key->size = start->ksize;
+ cursor->set_key(cursor, key);
+ }
+ if ((ret = cursor->search(cursor)) == 0) {
+ if (g.type == FIX) {
+ testutil_check(
+ cursor->get_value(cursor, &bitfield));
+ *(uint8_t *)(value->data) = bitfield;
+ value->size = 1;
+ } else
+ testutil_check(
+ cursor->get_value(cursor, value));
+ } else
+ if (ret != WT_NOTFOUND)
+ return (ret);
+
+ /* Check for simple matches. */
+ if (ret == 0 && !start->deleted &&
+ value->size == start->vsize &&
+ memcmp(value->data, start->vdata, value->size) == 0)
+ continue;
+ if (ret == WT_NOTFOUND && start->deleted)
+ continue;
+
+ /*
+ * In fixed length stores, zero values at the end of the key
+ * space are returned as not-found, and not-found row reads
+ * are saved as zero values. Map back-and-forth for simplicity.
+ */
+ if (g.type == FIX) {
+ if (ret == WT_NOTFOUND &&
+ start->vsize == 1 && *(uint8_t *)start->vdata == 0)
+ continue;
+ if (start->deleted &&
+ value->size == 1 && *(uint8_t *)value->data == 0)
+ continue;
+ }
+
+ /* Things went pear-shaped. */
+ switch (g.type) {
+ case FIX:
+ testutil_die(ret,
+ "snap_check: %" PRIu64 " search: "
+ "expected {0x%02x}, found {0x%02x}",
+ start->keyno,
+ start->deleted ? 0 : *(uint8_t *)start->vdata,
+ ret == WT_NOTFOUND ? 0 : *(uint8_t *)value->data);
+ /* NOTREACHED */
+ case ROW:
+ testutil_die(ret,
+ "snap_check: %.*s search: "
+ "expected {%.*s}, found {%.*s}",
+ (int)key->size, key->data,
+ start->deleted ?
+ (int)strlen("deleted") : (int)start->vsize,
+ start->deleted ? "deleted" : start->vdata,
+ ret == WT_NOTFOUND ?
+ (int)strlen("deleted") : (int)value->size,
+ ret == WT_NOTFOUND ? "deleted" : value->data);
+ /* NOTREACHED */
+ case VAR:
+ testutil_die(ret,
+ "snap_check: %" PRIu64 " search: "
+ "expected {%.*s}, found {%.*s}",
+ start->keyno,
+ start->deleted ?
+ (int)strlen("deleted") : (int)start->vsize,
+ start->deleted ? "deleted" : start->vdata,
+ ret == WT_NOTFOUND ?
+ (int)strlen("deleted") : (int)value->size,
+ ret == WT_NOTFOUND ? "deleted" : value->data);
+ /* NOTREACHED */
+ }
+ }
+ return (0);
+}
+
+/*
+ * ops --
+ * Per-thread operations.
+ */
static void *
ops(void *arg)
{
+ SNAP_OPS *snap, snap_list[64];
TINFO *tinfo;
WT_CONNECTION *conn;
WT_CURSOR *cursor, *cursor_insert;
+ WT_DECL_RET;
+ WT_ITEM *key, _key, *value, _value;
WT_SESSION *session;
- WT_ITEM key, value;
uint64_t keyno, ckpt_op, reset_op, session_op;
- uint32_t op;
- uint8_t *keybuf, *valbuf;
- u_int np;
- int ckpt_available, dir, insert, intxn, notfound, readonly;
+ uint32_t op, rnd;
+ u_int i;
+ int dir;
char *ckpt_config, ckpt_name[64];
+ bool ckpt_available, intxn, iso_snapshot, positioned, readonly;
tinfo = arg;
conn = g.wts_conn;
- keybuf = valbuf = NULL;
- readonly = 0; /* -Wconditional-uninitialized */
+ readonly = false; /* -Wconditional-uninitialized */
+
+ /* Initialize tracking of snapshot isolation transaction returns. */
+ snap = NULL;
+ iso_snapshot = false;
+ memset(snap_list, 0, sizeof(snap_list));
/* Initialize the per-thread random number generator. */
__wt_random_init(&tinfo->rnd);
/* Set up the default key and value buffers. */
- key_gen_setup(&keybuf);
- val_gen_setup(&tinfo->rnd, &valbuf);
+ key = &_key;
+ key_gen_setup(key);
+ value = &_value;
+ val_gen_setup(&tinfo->rnd, value);
/* Set the first operation where we'll create sessions and cursors. */
session_op = 0;
@@ -252,12 +424,12 @@ ops(void *arg)
/* Set the first operation where we'll perform checkpoint operations. */
ckpt_op = g.c_checkpoints ? mmrand(&tinfo->rnd, 100, 10000) : 0;
- ckpt_available = 0;
+ ckpt_available = false;
/* Set the first operation where we'll reset the session. */
reset_op = mmrand(&tinfo->rnd, 100, 10000);
- for (intxn = 0; !tinfo->quit; ++tinfo->ops) {
+ for (intxn = false; !tinfo->quit; ++tinfo->ops) {
/*
* We can't checkpoint or swap sessions/cursors while in a
* transaction, resolve any running transaction.
@@ -267,7 +439,7 @@ ops(void *arg)
testutil_check(
session->commit_transaction(session, NULL));
++tinfo->commit;
- intxn = 0;
+ intxn = false;
}
/* Open up a new session and cursors. */
@@ -276,8 +448,8 @@ ops(void *arg)
if (session != NULL)
testutil_check(session->close(session, NULL));
- testutil_check(conn->open_session(conn, NULL,
- ops_session_config(&tinfo->rnd), &session));
+ testutil_check(
+ conn->open_session(conn, NULL, NULL, &session));
/*
* 10% of the time, perform some read-only operations
@@ -299,7 +471,7 @@ ops(void *arg)
session_op += 250;
/* Checkpoints are read-only. */
- readonly = 1;
+ readonly = true;
} else {
/*
* Open two cursors: one for overwriting and one
@@ -325,21 +497,32 @@ ops(void *arg)
session_op += mmrand(&tinfo->rnd, 100, 5000);
/* Updates supported. */
- readonly = 0;
+ readonly = false;
}
}
/* Checkpoint the database. */
if (tinfo->ops == ckpt_op && g.c_checkpoints) {
/*
- * LSM and data-sources don't support named checkpoints,
+ * Checkpoints are single-threaded inside WiredTiger,
+ * skip our checkpoint if another thread is already
+ * doing one.
+ */
+ ret = pthread_rwlock_trywrlock(&g.checkpoint_lock);
+ if (ret == EBUSY)
+ goto skip_checkpoint;
+ testutil_check(ret);
+
+ /*
+ * LSM and data-sources don't support named checkpoints
* and we can't drop a named checkpoint while there's a
- * cursor open on it, otherwise 20% of the time name the
- * checkpoint.
+ * backup in progress, otherwise name the checkpoint 5%
+ * of the time.
*/
- if (DATASOURCE("helium") || DATASOURCE("kvsbdb") ||
- DATASOURCE("lsm") ||
- readonly || mmrand(&tinfo->rnd, 1, 5) == 1)
+ if (mmrand(&tinfo->rnd, 1, 20) != 1 ||
+ DATASOURCE("helium") ||
+ DATASOURCE("kvsbdb") || DATASOURCE("lsm") ||
+ pthread_rwlock_trywrlock(&g.backup_lock) == EBUSY)
ckpt_config = NULL;
else {
(void)snprintf(ckpt_name, sizeof(ckpt_name),
@@ -347,11 +530,6 @@ ops(void *arg)
ckpt_config = ckpt_name;
}
- /* Named checkpoints lock out backups */
- if (ckpt_config != NULL)
- testutil_check(
- pthread_rwlock_wrlock(&g.backup_lock));
-
testutil_checkfmt(
session->checkpoint(session, ckpt_config),
"%s", ckpt_config == NULL ? "" : ckpt_config);
@@ -359,6 +537,8 @@ ops(void *arg)
if (ckpt_config != NULL)
testutil_check(
pthread_rwlock_unlock(&g.backup_lock));
+ testutil_check(
+ pthread_rwlock_unlock(&g.checkpoint_lock));
/* Rephrase the checkpoint name for cursor open. */
if (ckpt_config == NULL)
@@ -367,9 +547,9 @@ ops(void *arg)
else
(void)snprintf(ckpt_name, sizeof(ckpt_name),
"checkpoint=thread-%d", tinfo->id);
- ckpt_available = 1;
+ ckpt_available = true;
- /* Pick the next checkpoint operation. */
+skip_checkpoint: /* Pick the next checkpoint operation. */
ckpt_op += mmrand(&tinfo->rnd, 5000, 20000);
}
@@ -386,21 +566,24 @@ ops(void *arg)
}
/*
- * If we're not single-threaded and we're not in a transaction,
- * start a transaction 20% of the time.
+ * If we're not single-threaded and not in a transaction, choose
+ * an isolation level and start a transaction some percentage of
+ * the time.
*/
if (!SINGLETHREADED &&
- !intxn && mmrand(&tinfo->rnd, 1, 10) >= 8) {
+ !intxn && mmrand(&tinfo->rnd, 1, 100) >= g.c_txn_freq) {
+ testutil_check(
+ session->reconfigure(session,
+ isolation_config(&tinfo->rnd, &iso_snapshot)));
testutil_check(
session->begin_transaction(session, NULL));
- intxn = 1;
- }
- insert = notfound = 0;
+ snap = iso_snapshot ? snap_list : NULL;
+ intxn = true;
+ }
keyno = mmrand(&tinfo->rnd, 1, (u_int)g.rows);
- key.data = keybuf;
- value.data = valbuf;
+ positioned = false;
/*
* Perform some number of operations: the percentage of deletes,
@@ -414,27 +597,30 @@ ops(void *arg)
++tinfo->remove;
switch (g.type) {
case ROW:
- /*
- * If deleting a non-existent record, the cursor
- * won't be positioned, and so can't do a next.
- */
- if (row_remove(cursor, &key, keyno, &notfound))
- goto deadlock;
+ ret = row_remove(cursor, key, keyno);
break;
case FIX:
case VAR:
- if (col_remove(cursor, &key, keyno, &notfound))
- goto deadlock;
+ ret = col_remove(cursor, key, keyno);
break;
}
+ if (ret == 0) {
+ positioned = true;
+ if (snap != NULL && (size_t)
+ (snap - snap_list) < WT_ELEMENTS(snap_list))
+ snap_track(snap++, keyno, NULL, NULL);
+ } else {
+ positioned = false;
+ if (ret == WT_ROLLBACK && intxn)
+ goto deadlock;
+ }
} else if (op < g.c_delete_pct + g.c_insert_pct) {
++tinfo->insert;
switch (g.type) {
case ROW:
- if (row_insert(
- tinfo, cursor, &key, &value, keyno))
- goto deadlock;
- insert = 1;
+ key_gen_insert(&tinfo->rnd, key, keyno);
+ val_gen(&tinfo->rnd, value, keyno);
+ ret = row_insert(cursor, key, value, keyno);
break;
case FIX:
case VAR:
@@ -447,37 +633,60 @@ ops(void *arg)
goto skip_insert;
/* Insert, then reset the insert cursor. */
- if (col_insert(tinfo,
- cursor_insert, &key, &value, &keyno))
- goto deadlock;
+ val_gen(&tinfo->rnd, value, g.rows + 1);
+ ret = col_insert(
+ cursor_insert, key, value, &keyno);
testutil_check(
cursor_insert->reset(cursor_insert));
-
- insert = 1;
break;
}
+ positioned = false;
+ if (ret == 0) {
+ if (snap != NULL && (size_t)
+ (snap - snap_list) < WT_ELEMENTS(snap_list))
+ snap_track(snap++, keyno,
+ g.type == ROW ? key : NULL, value);
+ } else
+ if (ret == WT_ROLLBACK && intxn)
+ goto deadlock;
} else if (
op < g.c_delete_pct + g.c_insert_pct + g.c_write_pct) {
++tinfo->update;
switch (g.type) {
case ROW:
- if (row_update(
- tinfo, cursor, &key, &value, keyno))
- goto deadlock;
+ key_gen(key, keyno);
+ val_gen(&tinfo->rnd, value, keyno);
+ ret = row_update(cursor, key, value, keyno);
break;
case FIX:
case VAR:
-skip_insert: if (col_update(tinfo,
- cursor, &key, &value, keyno))
- goto deadlock;
+skip_insert: val_gen(&tinfo->rnd, value, keyno);
+ ret = col_update(cursor, key, value, keyno);
break;
}
+ if (ret == 0) {
+ positioned = true;
+ if (snap != NULL && (size_t)
+ (snap - snap_list) < WT_ELEMENTS(snap_list))
+ snap_track(snap++, keyno, NULL, value);
+ } else {
+ positioned = false;
+ if (ret == WT_ROLLBACK && intxn)
+ goto deadlock;
+ }
} else {
++tinfo->search;
- if (read_row(cursor, &key, keyno, 0))
- if (intxn)
+ ret = read_row(cursor, key, value, keyno);
+ if (ret == 0) {
+ positioned = true;
+ if (snap != NULL && (size_t)
+ (snap - snap_list) < WT_ELEMENTS(snap_list))
+ snap_track(snap++, keyno, NULL, value);
+ } else {
+ positioned = false;
+ if (ret == WT_ROLLBACK && intxn)
goto deadlock;
- continue;
+ }
}
/*
@@ -485,55 +694,64 @@ skip_insert: if (col_update(tinfo,
* insert, do a small number of next/prev cursor operations in
* a random direction.
*/
- if (!insert) {
+ if (positioned) {
dir = (int)mmrand(&tinfo->rnd, 0, 1);
- for (np = 0; np < mmrand(&tinfo->rnd, 1, 100); ++np) {
- if (notfound)
- break;
- if (nextprev(cursor, dir, &notfound))
+ for (i = 0; i < mmrand(&tinfo->rnd, 1, 100); ++i) {
+ if ((ret = nextprev(cursor, dir)) == 0)
+ continue;
+ if (ret == WT_ROLLBACK && intxn)
goto deadlock;
+ break;
}
}
- /* Read to confirm the operation. */
- ++tinfo->search;
- if (read_row(cursor, &key, keyno, 0))
- goto deadlock;
-
/* Reset the cursor: there is no reason to keep pages pinned. */
testutil_check(cursor->reset(cursor));
/*
- * If we're in the transaction, commit 40% of the time and
+ * If we're in a transaction, commit 40% of the time and
* rollback 10% of the time.
*/
- if (intxn)
- switch (mmrand(&tinfo->rnd, 1, 10)) {
- case 1: case 2: case 3: case 4: /* 40% */
- testutil_check(session->commit_transaction(
- session, NULL));
- ++tinfo->commit;
- intxn = 0;
- break;
- case 5: /* 10% */
- if (0) {
-deadlock: ++tinfo->deadlock;
- }
- testutil_check(session->rollback_transaction(
- session, NULL));
- ++tinfo->rollback;
- intxn = 0;
- break;
- default:
- break;
+ if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5)
+ continue;
+
+ /*
+ * Ending the transaction. If in snapshot isolation, repeat the
+ * operations and confirm they're unchanged.
+ */
+ if (snap != NULL && (ret = snap_check(
+ cursor, snap_list, snap, key, value)) == WT_ROLLBACK)
+ goto deadlock;
+
+ switch (rnd) {
+ case 1: case 2: case 3: case 4: /* 40% */
+ testutil_check(
+ session->commit_transaction(session, NULL));
+ ++tinfo->commit;
+ break;
+ case 5: /* 10% */
+ if (0) {
+deadlock: ++tinfo->deadlock;
}
+ testutil_check(
+ session->rollback_transaction(session, NULL));
+ ++tinfo->rollback;
+ break;
+ }
+
+ intxn = false;
+ snap = NULL;
}
if (session != NULL)
testutil_check(session->close(session, NULL));
- free(keybuf);
- free(valbuf);
+ for (i = 0; i < WT_ELEMENTS(snap_list); ++i) {
+ free(snap_list[i].kdata);
+ free(snap_list[i].vdata);
+ }
+ free(key->mem);
+ free(value->mem);
tinfo->state = TINFO_COMPLETE;
return (NULL);
@@ -548,40 +766,47 @@ wts_read_scan(void)
{
WT_CONNECTION *conn;
WT_CURSOR *cursor;
- WT_ITEM key;
+ WT_DECL_RET;
+ WT_ITEM key, value;
WT_SESSION *session;
- uint64_t cnt, last_cnt;
- uint8_t *keybuf;
+ uint64_t keyno, last_keyno;
conn = g.wts_conn;
- /* Set up the default key buffer. */
- key_gen_setup(&keybuf);
+ /* Set up the default key/value buffers. */
+ key_gen_setup(&key);
+ val_gen_setup(NULL, &value);
/* Open a session and cursor pair. */
- testutil_check(conn->open_session(
- conn, NULL, ops_session_config(NULL), &session));
- testutil_check(session->open_cursor(
- session, g.uri, NULL, NULL, &cursor));
+ testutil_check(conn->open_session(conn, NULL, NULL, &session));
+ testutil_check(
+ session->open_cursor(session, g.uri, NULL, NULL, &cursor));
/* Check a random subset of the records using the key. */
- for (last_cnt = cnt = 0; cnt < g.key_cnt;) {
- cnt += mmrand(NULL, 1, 17);
- if (cnt > g.rows)
- cnt = g.rows;
- if (cnt - last_cnt > 1000) {
- track("read row scan", cnt, NULL);
- last_cnt = cnt;
+ for (last_keyno = keyno = 0; keyno < g.key_cnt;) {
+ keyno += mmrand(NULL, 1, 17);
+ if (keyno > g.rows)
+ keyno = g.rows;
+ if (keyno - last_keyno > 1000) {
+ track("read row scan", keyno, NULL);
+ last_keyno = keyno;
}
- key.data = keybuf;
- testutil_checkfmt(
- read_row(cursor, &key, cnt, 0), "%s", "read_scan");
+ switch (ret = read_row(cursor, &key, &value, keyno)) {
+ case 0:
+ case WT_NOTFOUND:
+ case WT_ROLLBACK:
+ break;
+ default:
+ testutil_die(
+ ret, "wts_read_scan: read row %" PRIu64, keyno);
+ }
}
testutil_check(session->close(session, NULL));
- free(keybuf);
+ free(key.mem);
+ free(value.mem);
}
/*
@@ -589,10 +814,9 @@ wts_read_scan(void)
* Read and verify a single element in a row- or column-store file.
*/
int
-read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
+read_row(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
{
static int sn = 0;
- WT_ITEM value;
WT_SESSION *session;
int exact, ret;
uint8_t bitfield;
@@ -611,7 +835,7 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
cursor->set_key(cursor, keyno);
break;
case ROW:
- key_gen((uint8_t *)key->data, &key->size, keyno);
+ key_gen(key, keyno);
cursor->set_key(cursor, key);
break;
}
@@ -628,37 +852,33 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
switch (ret) {
case 0:
if (g.type == FIX) {
- ret = cursor->get_value(cursor, &bitfield);
- value.data = &bitfield;
- value.size = 1;
+ testutil_check(cursor->get_value(cursor, &bitfield));
+ *(uint8_t *)(value->data) = bitfield;
+ value->size = 1;
} else
- ret = cursor->get_value(cursor, &value);
+ testutil_check(cursor->get_value(cursor, value));
break;
- case WT_ROLLBACK:
- return (WT_ROLLBACK);
case WT_NOTFOUND:
- if (notfound_err)
- return (WT_NOTFOUND);
+ /*
+ * In fixed length stores, zero values at the end of the key
+ * space are returned as not found. Treat this the same as
+ * a zero value in the key space, to match BDB's behavior.
+ */
+ if (g.type == FIX) {
+ *(uint8_t *)(value->data) = 0;
+ value->size = 1;
+ ret = 0;
+ }
break;
+ case WT_ROLLBACK:
+ return (WT_ROLLBACK);
default:
testutil_die(ret, "read_row: read row %" PRIu64, keyno);
}
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
- return (0);
-
- /*
- * In fixed length stores, zero values at the end of the key space are
- * returned as not found. Treat this the same as a zero value in the
- * key space, to match BDB's behavior.
- */
- if (ret == WT_NOTFOUND && g.type == FIX) {
- bitfield = 0;
- value.data = &bitfield;
- value.size = 1;
- ret = 0;
- }
+ return (ret);
/* Retrieve the BDB value. */
{
@@ -669,20 +889,20 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
/* Check for not-found status. */
if (notfound_chk("read_row", ret, notfound, keyno))
- return (0);
+ return (ret);
/* Compare the two. */
- if (value.size != bdb_value.size ||
- memcmp(value.data, bdb_value.data, value.size) != 0) {
+ if (value->size != bdb_value.size ||
+ memcmp(value->data, bdb_value.data, value->size) != 0) {
fprintf(stderr,
"read_row: value mismatch %" PRIu64 ":\n", keyno);
print_item("bdb", &bdb_value);
- print_item(" wt", &value);
+ print_item(" wt", value);
testutil_die(0, NULL);
}
}
#endif
- return (0);
+ return (ret);
}
/*
@@ -690,21 +910,19 @@ read_row(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int notfound_err)
* Read and verify the next/prev element in a row- or column-store file.
*/
static int
-nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
+nextprev(WT_CURSOR *cursor, int next)
{
+ WT_DECL_RET;
WT_ITEM key, value;
uint64_t keyno;
- int ret;
uint8_t bitfield;
const char *which;
+ keyno = 0;
which = next ? "next" : "prev";
- keyno = 0;
- ret = next ? cursor->next(cursor) : cursor->prev(cursor);
- if (ret == WT_ROLLBACK)
- return (WT_ROLLBACK);
- if (ret == 0)
+ switch (ret = (next ? cursor->next(cursor) : cursor->prev(cursor))) {
+ case 0:
switch (g.type) {
case FIX:
if ((ret = cursor->get_key(cursor, &keyno)) == 0 &&
@@ -722,13 +940,20 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
ret = cursor->get_value(cursor, &value);
break;
}
- if (ret != 0 && ret != WT_NOTFOUND)
+ if (ret != 0)
+ testutil_die(ret, "nextprev: get_key/get_value");
+ break;
+ case WT_NOTFOUND:
+ break;
+ case WT_ROLLBACK:
+ return (WT_ROLLBACK);
+ default:
testutil_die(ret, "%s", which);
- *notfoundp = (ret == WT_NOTFOUND);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
- return (0);
+ return (ret);
{
WT_ITEM bdb_key, bdb_value;
@@ -743,7 +968,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
&bdb_value.data, &bdb_value.size, &notfound);
if (notfound_chk(
next ? "nextprev(next)" : "nextprev(prev)", ret, notfound, keyno))
- return (0);
+ return (ret);
/* Compare the two. */
if (g.type == ROW) {
@@ -794,7 +1019,7 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
}
}
#endif
- return (0);
+ return (ret);
}
/*
@@ -802,43 +1027,38 @@ nextprev(WT_CURSOR *cursor, int next, int *notfoundp)
* Update a row in a row-store file.
*/
static int
-row_update(TINFO *tinfo,
- WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
+row_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
{
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
session = cursor->session;
- key_gen((uint8_t *)key->data, &key->size, keyno);
- val_gen(&tinfo->rnd, (uint8_t *)value->data, &value->size, keyno);
-
/* Log the operation */
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s{%.*s}\n%-10s{%.*s}",
- "putK", (int)key->size, (char *)key->data,
- "putV", (int)value->size, (char *)value->data);
+ "%-10s{%.*s}, {%.*s}",
+ "put",
+ (int)key->size, key->data, (int)value->size, value->data);
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
- ret = cursor->update(cursor);
- if (ret == WT_ROLLBACK)
+ switch (ret = cursor->update(cursor)) {
+ case 0:
+ break;
+ case WT_CACHE_FULL:
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ default:
testutil_die(ret,
"row_update: update row %" PRIu64 " by key", keyno);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
return (0);
- {
- int notfound;
-
- bdb_update(key->data, key->size, value->data, value->size, &notfound);
- (void)notfound_chk("row_update", ret, notfound, keyno);
- }
+ bdb_update(key->data, key->size, value->data, value->size);
#endif
return (0);
}
@@ -848,16 +1068,13 @@ row_update(TINFO *tinfo,
* Update a row in a column-store file.
*/
static int
-col_update(TINFO *tinfo,
- WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
+col_update(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
{
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
session = cursor->session;
- val_gen(&tinfo->rnd, (uint8_t *)value->data, &value->size, keyno);
-
/* Log the operation */
if (g.logging == LOG_OPS) {
if (g.type == FIX)
@@ -877,23 +1094,22 @@ col_update(TINFO *tinfo,
cursor->set_value(cursor, *(uint8_t *)value->data);
else
cursor->set_value(cursor, value);
- ret = cursor->update(cursor);
- if (ret == WT_ROLLBACK)
+ switch (ret = cursor->update(cursor)) {
+ case 0:
+ break;
+ case WT_CACHE_FULL:
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ default:
testutil_die(ret, "col_update: %" PRIu64, keyno);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
return (0);
- {
- int notfound;
-
- key_gen((uint8_t *)key->data, &key->size, keyno);
- bdb_update(key->data, key->size, value->data, value->size, &notfound);
- (void)notfound_chk("col_update", ret, notfound, keyno);
- }
+ key_gen(key, keyno);
+ bdb_update(key->data, key->size, value->data, value->size);
#else
(void)key; /* [-Wunused-variable] */
#endif
@@ -912,8 +1128,7 @@ table_append_init(void)
g.append_cnt = 0;
free(g.append);
- if ((g.append = calloc(g.append_max, sizeof(uint64_t))) == NULL)
- testutil_die(errno, "calloc");
+ g.append = dcalloc(g.append_max, sizeof(uint64_t));
}
/*
@@ -1005,43 +1220,38 @@ table_append(uint64_t keyno)
* Insert a row in a row-store file.
*/
static int
-row_insert(TINFO *tinfo,
- WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
+row_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t keyno)
{
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
session = cursor->session;
- key_gen_insert(&tinfo->rnd, (uint8_t *)key->data, &key->size, keyno);
- val_gen(&tinfo->rnd, (uint8_t *)value->data, &value->size, keyno);
-
/* Log the operation */
if (g.logging == LOG_OPS)
(void)g.wt_api->msg_printf(g.wt_api, session,
- "%-10s{%.*s}\n%-10s{%.*s}",
- "insertK", (int)key->size, (char *)key->data,
- "insertV", (int)value->size, (char *)value->data);
+ "%-10s{%.*s}, {%.*s}",
+ "insert",
+ (int)key->size, key->data, (int)value->size, value->data);
cursor->set_key(cursor, key);
cursor->set_value(cursor, value);
- ret = cursor->insert(cursor);
- if (ret == WT_ROLLBACK)
+ switch (ret = cursor->insert(cursor)) {
+ case 0:
+ break;
+ case WT_CACHE_FULL:
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ default:
testutil_die(ret,
"row_insert: insert row %" PRIu64 " by key", keyno);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
return (0);
- {
- int notfound;
-
- bdb_update(key->data, key->size, value->data, value->size, &notfound);
- (void)notfound_chk("row_insert", ret, notfound, keyno);
- }
+ bdb_update(key->data, key->size, value->data, value->size);
#endif
return (0);
}
@@ -1051,24 +1261,25 @@ row_insert(TINFO *tinfo,
* Insert an element in a column-store file.
*/
static int
-col_insert(TINFO *tinfo,
- WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop)
+col_insert(WT_CURSOR *cursor, WT_ITEM *key, WT_ITEM *value, uint64_t *keynop)
{
+ WT_DECL_RET;
WT_SESSION *session;
uint64_t keyno;
- int ret;
session = cursor->session;
- val_gen(&tinfo->rnd, (uint8_t *)value->data, &value->size, g.rows + 1);
-
if (g.type == FIX)
cursor->set_value(cursor, *(uint8_t *)value->data);
else
cursor->set_value(cursor, value);
- if ((ret = cursor->insert(cursor)) != 0) {
- if (ret == WT_ROLLBACK)
- return (WT_ROLLBACK);
+ switch (ret = cursor->insert(cursor)) {
+ case 0:
+ break;
+ case WT_CACHE_FULL:
+ case WT_ROLLBACK:
+ return (WT_ROLLBACK);
+ default:
testutil_die(ret, "cursor.insert");
}
testutil_check(cursor->get_key(cursor, &keyno));
@@ -1093,12 +1304,8 @@ col_insert(TINFO *tinfo,
if (!SINGLETHREADED)
return (0);
- {
- int notfound;
-
- key_gen((uint8_t *)key->data, &key->size, keyno);
- bdb_update(key->data, key->size, value->data, value->size, &notfound);
- }
+ key_gen(key, keyno);
+ bdb_update(key->data, key->size, value->data, value->size);
#else
(void)key; /* [-Wunused-variable] */
#endif
@@ -1110,14 +1317,14 @@ col_insert(TINFO *tinfo,
* Remove a row from a row-store file.
*/
static int
-row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
+row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
{
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
session = cursor->session;
- key_gen((uint8_t *)key->data, &key->size, keyno);
+ key_gen(key, keyno);
/* Log the operation */
if (g.logging == LOG_OPS)
@@ -1128,16 +1335,20 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
/* We use the cursor in overwrite mode, check for existence. */
if ((ret = cursor->search(cursor)) == 0)
ret = cursor->remove(cursor);
- if (ret == WT_ROLLBACK)
+ switch (ret) {
+ case 0:
+ case WT_NOTFOUND:
+ break;
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ default:
testutil_die(ret,
"row_remove: remove %" PRIu64 " by key", keyno);
- *notfoundp = (ret == WT_NOTFOUND);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
- return (0);
+ return (ret);
{
int notfound;
@@ -1148,7 +1359,7 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
#else
(void)key; /* [-Wunused-variable] */
#endif
- return (0);
+ return (ret);
}
/*
@@ -1156,10 +1367,10 @@ row_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
* Remove a row from a column-store file.
*/
static int
-col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
+col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno)
{
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
session = cursor->session;
@@ -1172,35 +1383,38 @@ col_remove(WT_CURSOR *cursor, WT_ITEM *key, uint64_t keyno, int *notfoundp)
/* We use the cursor in overwrite mode, check for existence. */
if ((ret = cursor->search(cursor)) == 0)
ret = cursor->remove(cursor);
- if (ret == WT_ROLLBACK)
+ switch (ret) {
+ case 0:
+ case WT_NOTFOUND:
+ break;
+ case WT_ROLLBACK:
return (WT_ROLLBACK);
- if (ret != 0 && ret != WT_NOTFOUND)
+ default:
testutil_die(ret,
"col_remove: remove %" PRIu64 " by key", keyno);
- *notfoundp = (ret == WT_NOTFOUND);
+ }
#ifdef HAVE_BERKELEY_DB
if (!SINGLETHREADED)
- return (0);
-
- {
- int notfound;
+ return (ret);
/*
* Deleting a fixed-length item is the same as setting the bits to 0;
* do the same thing for the BDB store.
*/
if (g.type == FIX) {
- key_gen((uint8_t *)key->data, &key->size, keyno);
- bdb_update(key->data, key->size, "\0", 1, &notfound);
- } else
+ key_gen(key, keyno);
+ bdb_update(key->data, key->size, "\0", 1);
+ } else {
+ int notfound;
+
bdb_remove(keyno, &notfound);
- (void)notfound_chk("col_remove", ret, notfound, keyno);
+ (void)notfound_chk("col_remove", ret, notfound, keyno);
}
#else
(void)key; /* [-Wunused-variable] */
#endif
- return (0);
+ return (ret);
}
#ifdef HAVE_BERKELEY_DB
diff --git a/test/format/salvage.c b/test/format/salvage.c
index 526e1563390..8274c556364 100644
--- a/test/format/salvage.c
+++ b/test/format/salvage.c
@@ -36,8 +36,8 @@ static void
salvage(void)
{
WT_CONNECTION *conn;
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
conn = g.wts_conn;
track("salvage", 0ULL, NULL);
@@ -141,7 +141,7 @@ found: if (fstat(fd, &sb) == -1)
void
wts_salvage(void)
{
- int ret;
+ WT_DECL_RET;
/* Some data-sources don't support salvage. */
if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
@@ -158,7 +158,7 @@ wts_salvage(void)
testutil_die(ret, "salvage copy step failed");
/* Salvage, then verify. */
- wts_open(g.home, 1, &g.wts_conn);
+ wts_open(g.home, true, &g.wts_conn);
salvage();
wts_verify("post-salvage verify");
wts_close();
@@ -174,7 +174,7 @@ wts_salvage(void)
/* Corrupt the file randomly, salvage, then verify. */
if (corrupt()) {
- wts_open(g.home, 1, &g.wts_conn);
+ wts_open(g.home, true, &g.wts_conn);
salvage();
wts_verify("post-corrupt-salvage verify");
wts_close();
diff --git a/test/format/t.c b/test/format/t.c
index 28c22e23cb8..085163befe2 100644
--- a/test/format/t.c
+++ b/test/format/t.c
@@ -181,6 +181,7 @@ main(int argc, char *argv[])
*/
testutil_check(pthread_rwlock_init(&g.append_lock, NULL));
testutil_check(pthread_rwlock_init(&g.backup_lock, NULL));
+ testutil_check(pthread_rwlock_init(&g.checkpoint_lock, NULL));
testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
printf("%s: process %" PRIdMAX "\n", g.progname, (intmax_t)getpid());
@@ -198,8 +199,8 @@ main(int argc, char *argv[])
if (SINGLETHREADED)
bdb_open(); /* Initial file config */
#endif
- wts_open(g.home, 1, &g.wts_conn);
- wts_create();
+ wts_open(g.home, true, &g.wts_conn);
+ wts_init();
wts_load(); /* Load initial records */
wts_verify("post-bulk verify"); /* Verify */
@@ -275,6 +276,8 @@ main(int argc, char *argv[])
testutil_check(pthread_rwlock_destroy(&g.append_lock));
testutil_check(pthread_rwlock_destroy(&g.backup_lock));
+ testutil_check(pthread_rwlock_destroy(&g.checkpoint_lock));
+ testutil_check(pthread_rwlock_destroy(&g.death_lock));
config_clear();
@@ -288,7 +291,7 @@ main(int argc, char *argv[])
static void
startup(void)
{
- int ret;
+ WT_DECL_RET;
/* Flush/close any logging information. */
fclose_and_clear(&g.logfp);
diff --git a/test/format/util.c b/test/format/util.c
index 2e4c869366c..f2b4d18029e 100644
--- a/test/format/util.c
+++ b/test/format/util.c
@@ -32,56 +32,11 @@
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
-/*
- * dmalloc --
- * Call malloc, dying on failure.
- */
-void *
-dmalloc(size_t len)
-{
- void *p;
-
- if ((p = malloc(len)) == NULL)
- testutil_die(errno, "malloc");
- return (p);
-}
-
-/*
- * dstrdup --
- * Call strdup, dying on failure.
- */
-char *
-dstrdup(const char *str)
-{
- char *p;
-
- if ((p = strdup(str)) == NULL)
- testutil_die(errno, "strdup");
- return (p);
-}
-
-static inline uint32_t
-kv_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
-{
- /*
- * Focus on relatively small key/value items, admitting the possibility
- * of larger items. Pick a size close to the minimum most of the time,
- * only create a larger item 1 in 20 times, and a really big item 1 in
- * 1000 times. (Configuration can force large key/value minimum sizes,
- * where every key/value item is an overflow.)
- */
- if (keyno % 1000 == 0 && max < KILOBYTE(80)) {
- min = KILOBYTE(80);
- max = KILOBYTE(100);
- } else if (keyno % 20 != 0 && max > min + 20)
- max = min + 20;
- return (mmrand(rnd, min, max));
-}
-
void
key_len_setup(void)
{
size_t i;
+ uint32_t max;
/*
* The key is a variable length item with a leading 10-digit value.
@@ -91,72 +46,113 @@ key_len_setup(void)
* the pre-loaded lengths.
*
* Fill in the random key lengths.
+ *
+ * Focus on relatively small items, admitting the possibility of larger
+ * items. Pick a size close to the minimum most of the time, only create
+ * a larger item 1 in 20 times.
*/
- for (i = 0; i < sizeof(g.key_rand_len) / sizeof(g.key_rand_len[0]); ++i)
- g.key_rand_len[i] =
- kv_len(NULL, (uint64_t)i, g.c_key_min, g.c_key_max);
+ for (i = 0;
+ i < sizeof(g.key_rand_len) / sizeof(g.key_rand_len[0]); ++i) {
+ max = g.c_key_max;
+ if (i % 20 != 0 && max > g.c_key_min + 20)
+ max = g.c_key_min + 20;
+ g.key_rand_len[i] = mmrand(NULL, g.c_key_min, max);
+ }
}
void
-key_gen_setup(uint8_t **keyp)
+key_gen_setup(WT_ITEM *key)
{
- uint8_t *key;
size_t i, len;
-
- *keyp = NULL;
+ char *p;
len = MAX(KILOBYTE(100), g.c_key_max);
- key = dmalloc(len);
+ p = dmalloc(len);
for (i = 0; i < len; ++i)
- key[i] = (uint8_t)("abcdefghijklmnopqrstuvwxyz"[i % 26]);
- *keyp = key;
+ p[i] = "abcdefghijklmnopqrstuvwxyz"[i % 26];
+
+ key->mem = p;
+ key->memsize = len;
+ key->data = key->mem;
+ key->size = 0;
}
static void
-key_gen_common(uint8_t *key, size_t *sizep, uint64_t keyno, int suffix)
+key_gen_common(WT_ITEM *key, uint64_t keyno, int suffix)
{
int len;
+ char *p;
+
+ p = key->mem;
/*
* The key always starts with a 10-digit string (the specified cnt)
* followed by two digits, a random number between 1 and 15 if it's
* an insert, otherwise 00.
*/
- len = sprintf((char *)key, "%010" PRIu64 ".%02d", keyno, suffix);
+ len = sprintf(p, "%010" PRIu64 ".%02d", keyno, suffix);
/*
- * In a column-store, the key is only used for BDB, and so it doesn't
- * need a random length.
+ * In a column-store, the key is only used for Berkeley DB inserts,
+ * and so it doesn't need a random length.
*/
if (g.type == ROW) {
- key[len] = '/';
- len = (int)g.key_rand_len[keyno %
- (sizeof(g.key_rand_len) / sizeof(g.key_rand_len[0]))];
+ p[len] = '/';
+
+ /*
+ * Key sizes come from a lookup table, so we can't store really
+ * big key sizes in the table: the table isn't big enough to keep
+ * our hash from selecting too many big keys and blowing out the
+ * cache. Handle that here instead, using a really big key 1 in
+ * 2500 times.
+ */
+ len = keyno % 2500 == 0 && g.c_key_max < KILOBYTE(80) ?
+ KILOBYTE(80) :
+ (int)g.key_rand_len[keyno % WT_ELEMENTS(g.key_rand_len)];
}
- *sizep = (size_t)len;
+
+ key->data = key->mem;
+ key->size = (size_t)len;
}
void
-key_gen(uint8_t *key, size_t *sizep, uint64_t keyno)
+key_gen(WT_ITEM *key, uint64_t keyno)
{
- key_gen_common(key, sizep, keyno, 0);
+ key_gen_common(key, keyno, 0);
}
void
-key_gen_insert(WT_RAND_STATE *rnd, uint8_t *key, size_t *sizep, uint64_t keyno)
+key_gen_insert(WT_RAND_STATE *rnd, WT_ITEM *key, uint64_t keyno)
{
- key_gen_common(key, sizep, keyno, (int)mmrand(rnd, 1, 15));
+ key_gen_common(key, keyno, (int)mmrand(rnd, 1, 15));
}
static uint32_t val_dup_data_len; /* Length of duplicate data items */
+static inline uint32_t
+value_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
+{
+ /*
+ * Focus on relatively small items, admitting the possibility of larger
+ * items. Pick a size close to the minimum most of the time, only create
+ * a larger item 1 in 20 times, and a really big item roughly 1 in 2500
+ * times.
+ */
+ if (keyno % 2500 == 0 && max < KILOBYTE(80)) {
+ min = KILOBYTE(80);
+ max = KILOBYTE(100);
+ } else if (keyno % 20 != 0 && max > min + 20)
+ max = min + 20;
+ return (mmrand(rnd, min, max));
+}
+
void
-val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp)
+val_gen_setup(WT_RAND_STATE *rnd, WT_ITEM *value)
{
- uint8_t *val;
size_t i, len;
+ char *p;
- *valp = NULL;
+ memset(value, 0, sizeof(WT_ITEM));
/*
* Set initial buffer contents to recognizable text.
@@ -166,35 +162,43 @@ val_gen_setup(WT_RAND_STATE *rnd, uint8_t **valp)
* data for column-store run-length encoded files.
*/
len = MAX(KILOBYTE(100), g.c_value_max) + 20;
- val = dmalloc(len);
+ p = dmalloc(len);
for (i = 0; i < len; ++i)
- val[i] = (uint8_t)("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]);
+ p[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26];
- *valp = val;
+ value->mem = p;
+ value->memsize = len;
+ value->data = value->mem;
+ value->size = 0;
- val_dup_data_len = kv_len(rnd,
+ val_dup_data_len = value_len(rnd,
(uint64_t)mmrand(rnd, 1, 20), g.c_value_min, g.c_value_max);
}
void
-val_gen(WT_RAND_STATE *rnd, uint8_t *val, size_t *sizep, uint64_t keyno)
+val_gen(WT_RAND_STATE *rnd, WT_ITEM *value, uint64_t keyno)
{
+ char *p;
+
+ p = value->mem;
+ value->data = value->mem;
+
/*
* Fixed-length records: take the low N bits from the last digit of
* the record number.
*/
if (g.type == FIX) {
switch (g.c_bitcnt) {
- case 8: val[0] = (uint8_t)mmrand(rnd, 1, 0xff); break;
- case 7: val[0] = (uint8_t)mmrand(rnd, 1, 0x7f); break;
- case 6: val[0] = (uint8_t)mmrand(rnd, 1, 0x3f); break;
- case 5: val[0] = (uint8_t)mmrand(rnd, 1, 0x1f); break;
- case 4: val[0] = (uint8_t)mmrand(rnd, 1, 0x0f); break;
- case 3: val[0] = (uint8_t)mmrand(rnd, 1, 0x07); break;
- case 2: val[0] = (uint8_t)mmrand(rnd, 1, 0x03); break;
- case 1: val[0] = 1; break;
+ case 8: p[0] = (char)mmrand(rnd, 1, 0xff); break;
+ case 7: p[0] = (char)mmrand(rnd, 1, 0x7f); break;
+ case 6: p[0] = (char)mmrand(rnd, 1, 0x3f); break;
+ case 5: p[0] = (char)mmrand(rnd, 1, 0x1f); break;
+ case 4: p[0] = (char)mmrand(rnd, 1, 0x0f); break;
+ case 3: p[0] = (char)mmrand(rnd, 1, 0x07); break;
+ case 2: p[0] = (char)mmrand(rnd, 1, 0x03); break;
+ case 1: p[0] = 1; break;
}
- *sizep = 1;
+ value->size = 1;
return;
}
@@ -203,8 +207,8 @@ val_gen(WT_RAND_STATE *rnd, uint8_t *val, size_t *sizep, uint64_t keyno)
* test that by inserting a zero-length data item every so often.
*/
if (keyno % 63 == 0) {
- val[0] = '\0';
- *sizep = 0;
+ p[0] = '\0';
+ value->size = 0;
return;
}
@@ -219,13 +223,14 @@ val_gen(WT_RAND_STATE *rnd, uint8_t *val, size_t *sizep, uint64_t keyno)
if ((g.type == ROW || g.type == VAR) &&
g.c_repeat_data_pct != 0 &&
mmrand(rnd, 1, 100) < g.c_repeat_data_pct) {
- (void)strcpy((char *)val, "DUPLICATEV");
- val[10] = '/';
- *sizep = val_dup_data_len;
+ (void)strcpy(p, "DUPLICATEV");
+ p[10] = '/';
+ value->size = val_dup_data_len;
} else {
- (void)sprintf((char *)val, "%010" PRIu64, keyno);
- val[10] = '/';
- *sizep = kv_len(rnd, keyno, g.c_value_min, g.c_value_max);
+ (void)sprintf(p, "%010" PRIu64, keyno);
+ p[10] = '/';
+ value->size =
+ value_len(rnd, keyno, g.c_value_min, g.c_value_max);
}
}
@@ -305,15 +310,6 @@ path_setup(const char *home)
g.home_stats = dmalloc(len);
snprintf(g.home_stats, len, "%s/%s", g.home, "stats");
- /* Backup directory. */
- len = strlen(g.home) + strlen("BACKUP") + 2;
- g.home_backup = dmalloc(len);
- snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP");
-
- len = strlen(g.home) + strlen("BACKUP2") + 2;
- g.home_backup2 = dmalloc(len);
- snprintf(g.home_backup2, len, "%s/%s", g.home, "BACKUP2");
-
/* BDB directory. */
len = strlen(g.home) + strlen("bdb") + 2;
g.home_bdb = dmalloc(len);
@@ -341,18 +337,27 @@ path_setup(const char *home)
g.home_init = dmalloc(len);
snprintf(g.home_init, len, CMD, g.home, g.home, g.home);
- /* Backup directory initialize command, remove and re-create it. */
+ /* Primary backup directory. */
+ len = strlen(g.home) + strlen("BACKUP") + 2;
+ g.home_backup = dmalloc(len);
+ snprintf(g.home_backup, len, "%s/%s", g.home, "BACKUP");
+
+ /*
+ * Backup directory initialize command, remove and re-create the primary
+ * backup directory, plus a copy we maintain for recovery testing.
+ */
#undef CMD
#ifdef _WIN32
-#define CMD "del /s /q >:nul && mkdir %s %s"
+#define CMD "del %s/%s %s/%s /s /q >:nul && mkdir %s/%s %s/%s"
#else
-#define CMD "rm -rf %s %s && mkdir %s %s"
+#define CMD "rm -rf %s/%s %s/%s && mkdir %s/%s %s/%s"
#endif
- len = strlen(g.home_backup) * 2 +
- strlen(g.home_backup2) * 2 + strlen(CMD) + 1;
+ len = strlen(g.home) * 4 +
+ strlen("BACKUP") * 2 + strlen("BACKUP_COPY") * 2 + strlen(CMD) + 1;
g.home_backup_init = dmalloc(len);
- snprintf(g.home_backup_init, len, CMD, g.home_backup, g.home_backup2,
- g.home_backup, g.home_backup2);
+ snprintf(g.home_backup_init, len, CMD,
+ g.home, "BACKUP", g.home, "BACKUP_COPY",
+ g.home, "BACKUP", g.home, "BACKUP_COPY");
/*
* Salvage command, save the interesting files so we can replay the
diff --git a/test/format/wts.c b/test/format/wts.c
index 81e484296e2..2ee01aa75b5 100644
--- a/test/format/wts.c
+++ b/test/format/wts.c
@@ -126,10 +126,10 @@ static WT_EVENT_HANDLER event_handler = {
* Open a connection to a WiredTiger database.
*/
void
-wts_open(const char *home, int set_api, WT_CONNECTION **connp)
+wts_open(const char *home, bool set_api, WT_CONNECTION **connp)
{
WT_CONNECTION *conn;
- int ret;
+ WT_DECL_RET;
char *config, *end, *p, helium_config[1024];
*connp = NULL;
@@ -138,10 +138,11 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
end = config + sizeof(g.wiredtiger_open_config);
p += snprintf(p, REMAIN(p, end),
- "create,checkpoint_sync=false,cache_size=%" PRIu32 "MB",
- g.c_cache);
-
- p += snprintf(p, REMAIN(p, end), ",error_prefix=\"%s\"", g.progname);
+ "create=true,"
+ "cache_size=%" PRIu32 "MB,"
+ "checkpoint_sync=false,"
+ "error_prefix=\"%s\"",
+ g.c_cache, g.progname);
/* In-memory configuration. */
if (g.c_in_memory != 0)
@@ -273,8 +274,13 @@ wts_open(const char *home, int set_api, WT_CONNECTION **connp)
void
wts_reopen(void)
{
+ WT_CONNECTION *conn;
+
testutil_checkfmt(wiredtiger_open(g.home, &event_handler,
- g.wiredtiger_open_config, &g.wts_conn), "%s", g.home);
+ g.wiredtiger_open_config, &conn), "%s", g.home);
+
+ g.wt_api = conn->get_extension_api(conn);
+ g.wts_conn = conn;
}
/*
@@ -282,7 +288,7 @@ wts_reopen(void)
* Create the underlying store.
*/
void
-wts_create(void)
+wts_init(void)
{
WT_CONNECTION *conn;
WT_SESSION *session;
@@ -497,8 +503,8 @@ void
wts_verify(const char *tag)
{
WT_CONNECTION *conn;
+ WT_DECL_RET;
WT_SESSION *session;
- int ret;
if (g.c_verify == 0)
return;
@@ -531,12 +537,12 @@ wts_stats(void)
{
WT_CONNECTION *conn;
WT_CURSOR *cursor;
+ WT_DECL_RET;
WT_SESSION *session;
FILE *fp;
char *stat_name;
const char *pval, *desc;
uint64_t v;
- int ret;
/* Ignore statistics if they're not configured. */
if (g.c_statistics == 0)
diff --git a/test/huge/Makefile.am b/test/huge/Makefile.am
index bc76bdc0f3c..151d3a40dd4 100644
--- a/test/huge/Makefile.am
+++ b/test/huge/Makefile.am
@@ -10,4 +10,4 @@ t_LDFLAGS = -static
TESTS = smoke.sh
clean-local:
- rm -rf WiredTiger* *.core __*
+ rm -rf WT_TEST *.core
diff --git a/test/huge/huge.c b/test/huge/huge.c
index ad19035ff99..e7bfd08882f 100644
--- a/test/huge/huge.c
+++ b/test/huge/huge.c
@@ -26,14 +26,6 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <errno.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
#include "test_util.i"
static char home[512]; /* Program working dir */
@@ -205,8 +197,7 @@ main(int argc, char *argv[])
/* Allocate a buffer to use. */
len = small ? ((size_t)SMALL_MAX) : ((size_t)4 * GIGABYTE);
- if ((big = malloc(len)) == NULL)
- testutil_die(errno, "");
+ big = dmalloc(len);
memset(big, 'a', len);
/* Make sure the configurations all work. */
diff --git a/test/manydbs/Makefile.am b/test/manydbs/Makefile.am
index 53559b25243..d347868aa4f 100644
--- a/test/manydbs/Makefile.am
+++ b/test/manydbs/Makefile.am
@@ -10,4 +10,4 @@ t_LDFLAGS = -static
TESTS = smoke.sh
clean-local:
- rm -rf WiredTiger* *.core __*
+ rm -rf WT_TEST *.core
diff --git a/test/manydbs/manydbs.c b/test/manydbs/manydbs.c
index 1d3412a7b06..4ab455f3620 100644
--- a/test/manydbs/manydbs.c
+++ b/test/manydbs/manydbs.c
@@ -26,22 +26,10 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/wait.h>
-#include <errno.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <wiredtiger.h>
-
#include "test_util.i"
#define HOME_SIZE 512
-#define HOME_BASE "WT_HOME"
+#define HOME_BASE "WT_TEST"
static char home[HOME_SIZE]; /* Base home directory */
static char hometmp[HOME_SIZE]; /* Each conn home directory */
static const char *progname; /* Program name */
@@ -172,17 +160,10 @@ main(int argc, char *argv[])
* Allocate arrays for connection handles, sessions, statistics
* cursors and, if needed, data cursors.
*/
- if ((connections = calloc(
- (size_t)dbs, sizeof(WT_CONNECTION *))) == NULL)
- testutil_die(ENOMEM, "connection array malloc");
- if ((sessions = calloc(
- (size_t)dbs, sizeof(WT_SESSION *))) == NULL)
- testutil_die(ENOMEM, "session array malloc");
- if ((cond_reset_orig = calloc((size_t)dbs, sizeof(uint64_t))) == NULL)
- testutil_die(ENOMEM, "orig stat malloc");
- if (!idle && ((cursors = calloc(
- (size_t)dbs, sizeof(WT_CURSOR *))) == NULL))
- testutil_die(ENOMEM, "cursor array malloc");
+ connections = dcalloc((size_t)dbs, sizeof(WT_CONNECTION *));
+ sessions = dcalloc((size_t)dbs, sizeof(WT_SESSION *));
+ cond_reset_orig = dcalloc((size_t)dbs, sizeof(uint64_t));
+ cursors = idle ? NULL : dcalloc((size_t)dbs, sizeof(WT_CURSOR *));
memset(cmd, 0, sizeof(cmd));
/*
* Set up all the directory names.
@@ -257,8 +238,7 @@ main(int argc, char *argv[])
free(connections);
free(sessions);
free(cond_reset_orig);
- if (!idle)
- free(cursors);
+ free(cursors);
return (EXIT_SUCCESS);
}
diff --git a/test/packing/Makefile.am b/test/packing/Makefile.am
index a9e7e16e5c2..0e7c8cc8b2e 100644
--- a/test/packing/Makefile.am
+++ b/test/packing/Makefile.am
@@ -1,4 +1,5 @@
-AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include
+AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \
+ -I$(top_srcdir)/test/utility
noinst_PROGRAMS = intpack-test intpack-test2 intpack-test3 packing-test
LDADD = $(top_builddir)/libwiredtiger.la
diff --git a/test/packing/intpack-test.c b/test/packing/intpack-test.c
index 08cc3807725..6412ed296aa 100644
--- a/test/packing/intpack-test.c
+++ b/test/packing/intpack-test.c
@@ -26,9 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h" /* For __wt_XXX */
+#include "test_util.i"
-#include <assert.h>
+void (*custom_die)(void) = NULL;
int
main(void)
@@ -47,9 +47,10 @@ main(void)
#if 1
p = buf;
- assert(__wt_vpack_uint(&p, sizeof(buf), r) == 0);
+ testutil_check(__wt_vpack_uint(&p, sizeof(buf), r));
cp = buf;
- assert(__wt_vunpack_uint(&cp, sizeof(buf), &r2) == 0);
+ testutil_check(
+ __wt_vunpack_uint(&cp, sizeof(buf), &r2));
#else
/*
* Note: use memmove for comparison because GCC does
diff --git a/test/packing/intpack-test2.c b/test/packing/intpack-test2.c
index 7555d2724e7..e9443ad7ed1 100644
--- a/test/packing/intpack-test2.c
+++ b/test/packing/intpack-test2.c
@@ -26,9 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h" /* For __wt_XXX */
+#include "test_util.i"
-#include <assert.h>
+void (*custom_die)(void) = NULL;
int
main(void)
@@ -38,14 +38,15 @@ main(void)
for (i = 1; i < 1LL << 60; i <<= 1) {
end = buf;
- assert(__wt_vpack_uint(&end, sizeof(buf), (uint64_t)i) == 0);
+ testutil_check(
+ __wt_vpack_uint(&end, sizeof(buf), (uint64_t)i));
printf("%" PRId64 " ", i);
for (p = buf; p < end; p++)
printf("%02x", *p);
printf("\n");
end = buf;
- assert(__wt_vpack_int(&end, sizeof(buf), -i) == 0);
+ testutil_check(__wt_vpack_int(&end, sizeof(buf), -i));
printf("%" PRId64 " ", -i);
for (p = buf; p < end; p++)
printf("%02x", *p);
diff --git a/test/packing/intpack-test3.c b/test/packing/intpack-test3.c
index 2ebc01f9e2e..328b45d1bf7 100644
--- a/test/packing/intpack-test3.c
+++ b/test/packing/intpack-test3.c
@@ -26,9 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h" /* For __wt_XXX */
+#include "test_util.i"
-#include <assert.h>
+void (*custom_die)(void) = NULL;
void test_value(int64_t);
void test_spread(int64_t, int64_t, int64_t);
@@ -43,11 +43,12 @@ test_value(int64_t val)
size_t used_len;
sinput = val;
+ soutput = 0; /* Make GCC happy. */
p = buf;
- assert(__wt_vpack_int(&p, sizeof(buf), sinput) == 0);
+ testutil_check(__wt_vpack_int(&p, sizeof(buf), sinput));
used_len = (size_t)(p - buf);
cp = buf;
- assert(__wt_vunpack_int(&cp, used_len, &soutput) == 0);
+ testutil_check(__wt_vunpack_int(&cp, used_len, &soutput));
/* Ensure we got the correct value back */
if (sinput != soutput) {
fprintf(stderr, "mismatch %" PRIu64 ", %" PRIu64 "\n",
@@ -69,10 +70,9 @@ test_value(int64_t val)
uinput = (uint64_t)val;
p = buf;
- assert(__wt_vpack_uint(&p, sizeof(buf), uinput) == 0);
+ testutil_check(__wt_vpack_uint(&p, sizeof(buf), uinput));
cp = buf;
- assert(__wt_vunpack_uint(
- &cp, sizeof(buf), &uoutput) == 0);
+ testutil_check(__wt_vunpack_uint(&cp, sizeof(buf), &uoutput));
/* Ensure we got the correct value back */
if (sinput != soutput) {
fprintf(stderr, "mismatch %" PRIu64 ", %" PRIu64 "\n",
diff --git a/test/packing/packing-test.c b/test/packing/packing-test.c
index 9b7105d7d4a..706eeb0935c 100644
--- a/test/packing/packing-test.c
+++ b/test/packing/packing-test.c
@@ -26,9 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include "wt_internal.h" /* For __wt_XXX */
+#include "test_util.i"
-#include <assert.h>
+void (*custom_die)(void) = NULL;
static void
check(const char *fmt, ...)
@@ -40,13 +40,15 @@ check(const char *fmt, ...)
len = 0; /* -Werror=maybe-uninitialized */
va_start(ap, fmt);
- assert(__wt_struct_sizev(NULL, &len, fmt, ap) == 0);
+ testutil_check(__wt_struct_sizev(NULL, &len, fmt, ap));
va_end(ap);
- assert(len > 0 && len < sizeof(buf));
+ if (len < 1 || len >= sizeof(buf))
+ testutil_die(EINVAL,
+ "Unexpected length from __wt_struct_sizev");
va_start(ap, fmt);
- assert(__wt_struct_packv(NULL, buf, sizeof(buf), fmt, ap) == 0);
+ testutil_check(__wt_struct_packv(NULL, buf, sizeof(buf), fmt, ap));
va_end(ap);
printf("%s ", fmt);
diff --git a/test/readonly/Makefile.am b/test/readonly/Makefile.am
index 3abcd2386a1..8028e2ab845 100644
--- a/test/readonly/Makefile.am
+++ b/test/readonly/Makefile.am
@@ -10,4 +10,4 @@ t_LDFLAGS = -static
TESTS = smoke.sh
clean-local:
- rm -rf WT_RD* WiredTiger* *.core __*
+ rm -rf WT_RD* *.core
diff --git a/test/readonly/readonly.c b/test/readonly/readonly.c
index 41400da2605..a35e7ee23fc 100644
--- a/test/readonly/readonly.c
+++ b/test/readonly/readonly.c
@@ -26,20 +26,10 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/wait.h>
-#include <errno.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <wiredtiger.h>
-
#include "test_util.i"
+#include <sys/wait.h>
+
#define HOME_SIZE 512
static char home[HOME_SIZE]; /* Program working dir lock file */
#define HOME_WR_SUFFIX ".WRNOLOCK" /* Writable dir copy no lock file */
diff --git a/test/recovery/Makefile.am b/test/recovery/Makefile.am
index 35f8dd15823..6865d5edf3e 100644
--- a/test/recovery/Makefile.am
+++ b/test/recovery/Makefile.am
@@ -15,4 +15,4 @@ TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
clean-local:
- rm -rf WT_TEST* *.core __*
+ rm -rf WT_TEST.* *.core
diff --git a/test/salvage/Makefile.am b/test/salvage/Makefile.am
index 3e686dd2951..0fd46aefcb1 100644
--- a/test/salvage/Makefile.am
+++ b/test/salvage/Makefile.am
@@ -11,4 +11,4 @@ TESTS = $(noinst_PROGRAMS)
LOG_COMPILER = $(TEST_WRAPPER)
clean-local:
- rm -rf WiredTiger* *.core __*
+ rm -rf WiredTiger* __slvg* *.core
diff --git a/test/salvage/salvage.c b/test/salvage/salvage.c
index a1517d70787..f264be99e2b 100644
--- a/test/salvage/salvage.c
+++ b/test/salvage/salvage.c
@@ -159,7 +159,7 @@ int
usage(void)
{
(void)fprintf(stderr,
- "usage: %s [-v] [-r run] [-t fix|rle|var|row]\n", progname);
+ "usage: %s [-v] [-r run] [-t fix|var|row]\n", progname);
return (EXIT_FAILURE);
}
@@ -170,7 +170,7 @@ run(int r)
printf("\t%s: run %d\n", __wt_page_type_string(page_type), r);
- CHECK(system("rm -f WiredTiger* __slvg.* __schema.*") == 0);
+ CHECK(system("rm -f WiredTiger* __slvg.*") == 0);
CHECK((res_fp = fopen(RSLT, "w")) != NULL);
/*
diff --git a/test/suite/test_backup05.py b/test/suite/test_backup05.py
index 991a9f71b19..fbe219d8de8 100644
--- a/test/suite/test_backup05.py
+++ b/test/suite/test_backup05.py
@@ -37,10 +37,12 @@ import fnmatch, os, shutil, time
from suite_subprocess import suite_subprocess
from wtscenario import multiply_scenarios, number_scenarios, prune_scenarios
from helper import copy_wiredtiger_home
-import wttest
+import wiredtiger, wttest
class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
uri = 'table:test_backup05'
+ emptyuri = 'table:test_empty05'
+ newuri = 'table:test_new05'
create_params = 'key_format=i,value_format=i'
freq = 5
@@ -51,12 +53,35 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
# With the connection still open, copy files to new directory.
# Half the time use an unaligned copy.
- aligned = (i % (self.freq * 2) != 0) or os.name == "nt"
+ even = i % (self.freq * 2) == 0
+ aligned = even or os.name == "nt"
copy_wiredtiger_home(olddir, newdir, aligned)
+ # Half the time try to rename a table and the other half try
+ # to remove a table. They should fail.
+ if not even:
+ self.assertRaises(wiredtiger.WiredTigerError,
+ lambda: self.session.rename(
+ self.emptyuri, self.newuri, None))
+ else:
+ self.assertRaises(wiredtiger.WiredTigerError,
+ lambda: self.session.drop(self.emptyuri, None))
+
# Now simulate fsyncUnlock by closing the backup cursor.
cbkup.close()
+ # Once the backup cursor is closed we should be able to perform
+ # schema operations. Test that and then reset the files to their
+ # expected initial names.
+ if not even:
+ self.session.rename(self.emptyuri, self.newuri, None)
+ self.session.drop(self.newuri, None)
+ self.session.create(self.emptyuri, self.create_params)
+ else:
+ self.session.drop(self.emptyuri, None)
+ self.session.create(self.emptyuri, self.create_params)
+
+
# Open the new directory and verify
conn = self.setUpConnectionOpen(newdir)
session = self.setUpSessionOpen(conn)
@@ -77,6 +102,10 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
#
# If the metadata isn't flushed, eventually the metadata we copy will
# be sufficiently out-of-sync with the data file that it won't verify.
+
+ self.session.create(self.emptyuri, self.create_params)
+ self.reopen_conn()
+
self.session.create(self.uri, self.create_params)
for i in range(100):
c = self.session.open_cursor(self.uri)
@@ -88,7 +117,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess):
self.session.verify(self.uri)
def test_backup(self):
- with self.expectedStdoutPattern('Recreating metadata'):
+ with self.expectedStdoutPattern('recreating metadata'):
self.backup()
if __name__ == '__main__':
diff --git a/test/suite/test_join01.py b/test/suite/test_join01.py
index 4aa2bc6e269..f8d96a2718a 100644
--- a/test/suite/test_join01.py
+++ b/test/suite/test_join01.py
@@ -35,10 +35,44 @@ from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
class test_join01(wttest.WiredTigerTestCase):
nentries = 100
- scenarios = [
+ type_scen = [
('table', dict(ref='table')),
('index', dict(ref='index'))
]
+ bloom0_scen = [
+ ('bloom0=0', dict(joincfg0='')),
+ ('bloom0=1000', dict(joincfg0=',strategy=bloom,count=1000')),
+ ('bloom0=10000', dict(joincfg0=',strategy=bloom,count=10000')),
+ ]
+ bloom1_scen = [
+ ('bloom1=0', dict(joincfg1='')),
+ ('bloom1=1000', dict(joincfg1=',strategy=bloom,count=1000')),
+ ('bloom1=10000', dict(joincfg1=',strategy=bloom,count=10000')),
+ ]
+ projection_scen = [
+ ('no-projection', dict(do_proj=False)),
+ ('projection', dict(do_proj=True))
+ ]
+ nested_scen = [
+ ('simple', dict(do_nested=False)),
+ ('nested', dict(do_nested=True))
+ ]
+ stats_scen = [
+ ('no-stats', dict(do_stats=False)),
+ ('stats', dict(do_stats=True))
+ ]
+ order_scen = [
+ ('order=0', dict(join_order=0)),
+ ('order=1', dict(join_order=1)),
+ ('order=2', dict(join_order=2)),
+ ('order=3', dict(join_order=3)),
+ ]
+ scenarios = number_scenarios(multiply_scenarios('.', type_scen,
+ bloom0_scen, bloom1_scen,
+ projection_scen,
+ nested_scen, stats_scen,
+ order_scen))
+
# We need statistics for these tests.
conn_config = 'statistics=(all)'
@@ -52,9 +86,29 @@ class test_join01(wttest.WiredTigerTestCase):
return [s, rs, sort3]
# Common function for testing iteration of join cursors
- def iter_common(self, jc, do_proj):
+ def iter_common(self, jc, do_proj, do_nested, join_order):
# See comments in join_common()
- expect = [73, 82, 62, 83, 92]
+ # The order that the results are seen depends on
+ # the ordering of the joins. Specifically, the first
+ # join drives the order that results are seen.
+ if do_nested:
+ if join_order == 0:
+ expect = [73, 82, 83, 92]
+ elif join_order == 1:
+ expect = [73, 82, 83, 92]
+ elif join_order == 2:
+ expect = [82, 92, 73, 83]
+ elif join_order == 3:
+ expect = [92, 73, 82, 83]
+ else:
+ if join_order == 0:
+ expect = [73, 82, 62, 83, 92]
+ elif join_order == 1:
+ expect = [62, 73, 82, 83, 92]
+ elif join_order == 2:
+ expect = [62, 82, 92, 73, 83]
+ elif join_order == 3:
+ expect = [73, 82, 62, 83, 92]
while jc.next() == 0:
[k] = jc.get_keys()
i = k - 1
@@ -64,7 +118,9 @@ class test_join01(wttest.WiredTigerTestCase):
[v0,v1,v2] = jc.get_values()
self.assertEquals(self.gen_values(i), [v0,v1,v2])
if len(expect) == 0 or i != expect[0]:
- self.tty(' result ' + str(i) + ' is not in: ' + str(expect))
+ self.tty('ERROR: ' + str(i) + ' is not next in: ' +
+ str(expect))
+ self.tty('JOIN ORDER=' + str(join_order) + ', NESTED=' + str(do_nested))
self.assertTrue(i == expect[0])
expect.remove(i)
self.assertEquals(0, len(expect))
@@ -81,6 +137,8 @@ class test_join01(wttest.WiredTigerTestCase):
'join: index:join01:index2: ' + statdesc ]
if self.ref == 'index':
expectstats.append('join: index:join01:index0: ' + statdesc)
+ elif self.do_proj:
+ expectstats.append('join: table:join01(v2,v1,v0): ' + statdesc)
else:
expectstats.append('join: table:join01: ' + statdesc)
self.check_stats(statcur, expectstats)
@@ -118,11 +176,46 @@ class test_join01(wttest.WiredTigerTestCase):
self.assertTrue(len(expectstats) == 0,
'missing expected values in stats: ' + str(expectstats))
+ def session_record_join(self, jc, refc, config, order, joins):
+     # Do not join yet: remember the arguments, tagged with 'order',
+     # by appending them to the caller's 'joins' list.
+     joinargs = [jc, refc, config]
+     joins.append([order, joinargs])
+
+ def session_play_one_join(self, firsturi, jc, refc, config):
+     # Issue one recorded join.  When the reference cursor is on the
+     # first URI joined, the 'strategy=bloom' text is stripped from
+     # its configuration before the join is made.
+     is_first = (refc.uri == firsturi)
+     if is_first and config != None:
+         config = config.replace('strategy=bloom','')
+     #self.tty('->join(jc, uri="' + refc.uri +
+     #    '", config="' + str(config) + '"')
+     self.session.join(jc, refc, config)
+
+ def session_play_joins(self, joins, join_order):
+     # Replay the recorded joins in rotated order: entries recorded
+     # with an order >= join_order are played first, then the rest.
+     # Recording order is kept within each group.  The URI of the
+     # first join played is handed to every replayed join.
+     #self.tty('->')
+     leading = [args for [i, args] in joins if i >= join_order]
+     trailing = [args for [i, args] in joins if i < join_order]
+     firsturi = None
+     for joinargs in leading + trailing:
+         if firsturi == None:
+             firsturi = joinargs[1].uri
+         self.session_play_one_join(firsturi, *joinargs)
+
# Common function for testing the most basic functionality
# of joins
- def join_common(self, joincfg0, joincfg1, do_proj, do_stats):
+ def test_join(self):
+ joincfg0 = self.joincfg0
+ joincfg1 = self.joincfg1
+ do_proj = self.do_proj
+ do_nested = self.do_nested
+ do_stats = self.do_stats
+ join_order = self.join_order
#self.tty('join_common(' + joincfg0 + ',' + joincfg1 + ',' +
- # str(do_proj) + ')')
+ # str(do_proj) + ',' + str(do_nested) + ',' +
+ # str(do_stats) + ',' + str(join_order) + ')')
+
+ closeme = []
+ joins = [] # cursors to be joined
+
self.session.create('table:join01', 'key_format=r' +
',value_format=SSi,columns=(k,v0,v1,v2)')
self.session.create('index:join01:index0','columns=(v0)')
@@ -143,7 +236,7 @@ class test_join01(wttest.WiredTigerTestCase):
# We join on index2 first, not using bloom indices.
# This defines the order that items are returned.
- # index2 is sorts multiples of 3 first (see gen_values())
+ # index2 sorts multiples of 3 first (see gen_values())
# and by using 'gt' and key 99, we'll skip multiples of 3,
# and examine primary keys 2,5,8,...,95,98,1,4,7,...,94,97.
jc = self.session.open_cursor('join:table:join01' + proj_suffix,
@@ -152,7 +245,7 @@ class test_join01(wttest.WiredTigerTestCase):
c2 = self.session.open_cursor('index:join01:index2(v1)', None, None)
c2.set_key(99) # skips all entries w/ primary key divisible by three
self.assertEquals(0, c2.search())
- self.session.join(jc, c2, 'compare=gt')
+ self.session_record_join(jc, c2, 'compare=gt', 0, joins)
# Then select all the numbers 0-99 whose string representation
# sort >= '60'.
@@ -163,285 +256,87 @@ class test_join01(wttest.WiredTigerTestCase):
c0 = self.session.open_cursor('table:join01', None, None)
c0.set_key(60)
self.assertEquals(0, c0.search())
- self.session.join(jc, c0, 'compare=ge' + joincfg0)
+ self.session_record_join(jc, c0, 'compare=ge' + joincfg0, 1, joins)
# Then select all numbers whose reverse string representation
# is in '20' < x < '40'.
c1a = self.session.open_cursor('index:join01:index1(v1)', None, None)
c1a.set_key('21')
self.assertEquals(0, c1a.search())
- self.session.join(jc, c1a, 'compare=gt' + joincfg1)
+ self.session_record_join(jc, c1a, 'compare=gt' + joincfg1, 2, joins)
c1b = self.session.open_cursor('index:join01:index1(v1)', None, None)
c1b.set_key('41')
self.assertEquals(0, c1b.search())
- self.session.join(jc, c1b, 'compare=lt' + joincfg1)
+ self.session_record_join(jc, c1b, 'compare=lt' + joincfg1, 2, joins)
# Numbers that satisfy these 3 conditions (with ordering implied by c2):
# [73, 82, 62, 83, 92].
#
# After iterating, we should be able to reset and iterate again.
+ if do_nested:
+ # To test nesting, we create two new levels of conditions:
+ #
+ # x == 72 or x == 73 or x == 82 or x == 83 or
+ # (x >= 90 and x <= 99)
+ #
+ # that will get AND-ed into our existing join. The expected
+ # result is [73, 82, 83, 92].
+ #
+ # We don't specify the projection here, it should be picked up
+ # from the 'enclosing' join.
+ nest1 = self.session.open_cursor('join:table:join01', None, None)
+ nest2 = self.session.open_cursor('join:table:join01', None, None)
+
+ nc = self.session.open_cursor('index:join01:index0', None, None)
+ nc.set_key('90')
+ self.assertEquals(0, nc.search())
+ self.session.join(nest2, nc, 'compare=ge') # joincfg left out
+ closeme.append(nc)
+
+ nc = self.session.open_cursor('index:join01:index0', None, None)
+ nc.set_key('99')
+ self.assertEquals(0, nc.search())
+ self.session.join(nest2, nc, 'compare=le')
+ closeme.append(nc)
+
+ self.session.join(nest1, nest2, "operation=or")
+
+ for val in [ '72', '73', '82', '83' ]:
+ nc = self.session.open_cursor('index:join01:index0', None, None)
+ nc.set_key(val)
+ self.assertEquals(0, nc.search())
+ self.session.join(nest1, nc, 'compare=eq,operation=or' +
+ joincfg0)
+ closeme.append(nc)
+ self.session_record_join(jc, nest1, None, 3, joins)
+
+ self.session_play_joins(joins, join_order)
+ self.iter_common(jc, do_proj, do_nested, join_order)
if do_stats:
self.stats(jc, 0)
- self.iter_common(jc, do_proj)
+ jc.reset()
+ self.iter_common(jc, do_proj, do_nested, join_order)
if do_stats:
self.stats(jc, 1)
jc.reset()
- self.iter_common(jc, do_proj)
+ self.iter_common(jc, do_proj, do_nested, join_order)
if do_stats:
self.stats(jc, 2)
jc.reset()
- self.iter_common(jc, do_proj)
+ self.iter_common(jc, do_proj, do_nested, join_order)
jc.close()
c2.close()
c1a.close()
c1b.close()
c0.close()
+ if do_nested:
+ nest1.close()
+ nest2.close()
+ for c in closeme:
+ c.close()
self.session.drop('table:join01')
- # Test joins with basic functionality
- def test_join(self):
- bloomcfg1000 = ',strategy=bloom,count=1000'
- bloomcfg10000 = ',strategy=bloom,count=10000'
- for cfga in [ '', bloomcfg1000, bloomcfg10000 ]:
- for cfgb in [ '', bloomcfg1000, bloomcfg10000 ]:
- for do_proj in [ False, True ]:
- #self.tty('cfga=' + cfga +
- # ', cfgb=' + cfgb +
- # ', doproj=' + str(do_proj))
- self.join_common(cfga, cfgb, do_proj, False)
-
- def test_join_errors(self):
- self.session.create('table:join01', 'key_format=r,value_format=SS'
- ',columns=(k,v0,v1)')
- self.session.create('table:join01B', 'key_format=r,value_format=SS'
- ',columns=(k,v0,v1)')
- self.session.create('index:join01:index0','columns=(v0)')
- self.session.create('index:join01:index1','columns=(v1)')
- self.session.create('index:join01B:index0','columns=(v0)')
- jc = self.session.open_cursor('join:table:join01', None, None)
- tc = self.session.open_cursor('table:join01', None, None)
- fc = self.session.open_cursor('file:join01.wt', None, None)
- ic0 = self.session.open_cursor('index:join01:index0', None, None)
- ic0again = self.session.open_cursor('index:join01:index0', None, None)
- ic1 = self.session.open_cursor('index:join01:index1', None, None)
- icB = self.session.open_cursor('index:join01B:index0', None, None)
- tcB = self.session.open_cursor('table:join01B', None, None)
-
- tc.set_key(1)
- tc.set_value('val1', 'val1')
- tc.insert()
- tcB.set_key(1)
- tcB.set_value('val1', 'val1')
- tcB.insert()
- fc.next()
-
- # Joining using a non join-cursor
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(tc, ic0, 'compare=ge'),
- '/not a join cursor/')
- # Joining a table cursor, not index
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, fc, 'compare=ge'),
- '/not an index or table cursor/')
- # Joining a non positioned cursor
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0, 'compare=ge'),
- '/requires reference cursor be positioned/')
- ic0.set_key('val1')
- # Joining a non positioned cursor (no search or next has been done)
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0, 'compare=ge'),
- '/requires reference cursor be positioned/')
- ic0.set_key('valXX')
- self.assertEqual(ic0.search(), wiredtiger.WT_NOTFOUND)
- # Joining a non positioned cursor after failed search
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0, 'compare=ge'),
- '/requires reference cursor be positioned/')
-
- # position the cursors now
- ic0.set_key('val1')
- ic0.search()
- ic0again.next()
- icB.next()
-
- # Joining non matching index
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, icB, 'compare=ge'),
- '/table for join cursor does not match/')
-
- # The cursor must be positioned
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic1, 'compare=ge'),
- '/requires reference cursor be positioned/')
- ic1.next()
-
- # The first cursor joined cannot be bloom
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic1,
- 'compare=ge,strategy=bloom,count=1000'),
- '/first joined cursor cannot specify strategy=bloom/')
-
- # This succeeds.
- self.session.join(jc, ic1, 'compare=ge'),
-
- # With bloom filters, a count is required
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0, 'compare=ge,strategy=bloom'),
- '/count must be nonzero/')
-
- # This succeeds.
- self.session.join(jc, ic0, 'compare=ge,strategy=bloom,count=1000'),
-
- bloom_config = ',strategy=bloom,count=1000'
- # Cannot use the same index cursor
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0,
- 'compare=le' + bloom_config),
- '/index cursor already used in a join/')
-
- # When joining with the same index, need compatible compares
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0again, 'compare=ge' + bloom_config),
- '/join has overlapping ranges/')
-
- # Another incompatible compare
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0again, 'compare=gt' + bloom_config),
- '/join has overlapping ranges/')
-
- # Compare is compatible, but bloom args need to match
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0again, 'compare=le'),
- '/join has incompatible strategy/')
-
- # Counts need to match for bloom filters
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: self.session.join(jc, ic0again, 'compare=le,strategy=bloom,'
- 'count=100'), '/count.* does not match previous count/')
-
- # This succeeds
- self.session.join(jc, ic0again, 'compare=le,strategy=bloom,count=1000')
-
- # Need to do initial next() before getting key/values
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: jc.get_keys(),
- '/join cursor must be advanced with next/')
-
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: jc.get_values(),
- '/join cursor must be advanced with next/')
-
- # Operations on the joined cursor are frozen until the join is closed.
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: ic0.next(),
- '/index cursor is being used in a join/')
-
- # Operations on the joined cursor are frozen until the join is closed.
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: ic0.prev(),
- '/index cursor is being used in a join/')
-
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: ic0.reset(),
- '/index cursor is being used in a join/')
-
- # Only a small number of operations allowed on a join cursor
- msg = "/Unsupported cursor/"
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: jc.search(), msg)
-
- self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
- lambda: jc.prev(), msg)
-
- self.assertEquals(jc.next(), 0)
- self.assertEquals(jc.next(), wiredtiger.WT_NOTFOUND)
-
- # Only after the join cursor is closed can we use the index cursor
- # normally
- jc.close()
- self.assertEquals(ic0.next(), wiredtiger.WT_NOTFOUND)
- self.assertEquals(ic0.prev(), 0)
-
- # common code for making sure that cursors can be
- # implicitly closed, no matter the order they are created
- def cursor_close_common(self, joinfirst):
- self.session.create('table:join01', 'key_format=r' +
- ',value_format=SS,columns=(k,v0,v1)')
- self.session.create('index:join01:index0','columns=(v0)')
- self.session.create('index:join01:index1','columns=(v1)')
- c = self.session.open_cursor('table:join01', None, None)
- for i in range(0, self.nentries):
- c.set_key(*self.gen_key(i))
- c.set_value(*self.gen_values(i))
- c.insert()
- c.close()
-
- if joinfirst:
- jc = self.session.open_cursor('join:table:join01', None, None)
- c0 = self.session.open_cursor('index:join01:index0', None, None)
- c1 = self.session.open_cursor('index:join01:index1', None, None)
- c0.next() # index cursors must be positioned
- c1.next()
- if not joinfirst:
- jc = self.session.open_cursor('join:table:join01', None, None)
- self.session.join(jc, c0, 'compare=ge')
- self.session.join(jc, c1, 'compare=ge')
- self.session.close()
- self.session = None
-
- def test_cursor_close1(self):
- self.cursor_close_common(True)
-
- def test_cursor_close2(self):
- self.cursor_close_common(False)
-
- # test statistics using the framework set up for this test
- def test_stats(self):
- bloomcfg1000 = ',strategy=bloom,count=1000'
- bloomcfg10 = ',strategy=bloom,count=10'
- self.join_common(bloomcfg1000, bloomcfg1000, False, True)
-
- # Intentially run with an underconfigured Bloom filter,
- # statistics should pick up some false positives.
- self.join_common(bloomcfg10, bloomcfg10, False, True)
-
- # test statistics with a simple one index join cursor
- def test_simple_stats(self):
- self.session.create("table:join01b",
- "key_format=i,value_format=i,columns=(k,v)")
- self.session.create("index:join01b:index", "columns=(v)")
-
- cursor = self.session.open_cursor("table:join01b", None, None)
- cursor[1] = 11
- cursor[2] = 12
- cursor[3] = 13
- cursor.close()
-
- cursor = self.session.open_cursor("index:join01b:index", None, None)
- cursor.set_key(11)
- cursor.search()
-
- jcursor = self.session.open_cursor("join:table:join01b", None, None)
- self.session.join(jcursor, cursor, "compare=gt")
-
- while jcursor.next() == 0:
- [k] = jcursor.get_keys()
- [v] = jcursor.get_values()
-
- statcur = self.session.open_cursor("statistics:join", jcursor, None)
- found = False
- while statcur.next() == 0:
- [desc, pvalue, value] = statcur.get_values()
- #self.tty(str(desc) + "=" + str(pvalue))
- found = True
- self.assertEquals(found, True)
-
- jcursor.close()
- cursor.close()
-
-
if __name__ == '__main__':
wttest.run()
diff --git a/test/suite/test_join07.py b/test/suite/test_join07.py
new file mode 100644
index 00000000000..36e91361329
--- /dev/null
+++ b/test/suite/test_join07.py
@@ -0,0 +1,548 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import os, re, run
+import wiredtiger, wttest, suite_random
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
+
+ # Error type raised by Tokenizer.error() when the input expression
+ # cannot be tokenized; the message carries the failing position.
+ class ParseException(Exception):
+ def __init__(self, msg):
+ super(ParseException, self).__init__(msg)
+
+ # One lexical token of the join expression language.  The class
+ # constants are the possible values of 'kind'.
+ class Token:
+ UNKNOWN = '<unknown>'
+ NUMBER = 'Number'
+ STRING = 'String'
+ COLUMN = 'Column'
+ LPAREN = '('
+ RPAREN = ')'
+ LBRACKET = '{'
+ RBRACKET = '}'
+ COMMA = ','
+ OR = '||'
+ AND = '&&'
+ LT = '<'
+ GT = '>'
+ LE = '<='
+ GE = '>='
+ EQ = '=='
+ ATTRIBUTE = 'Attribute' # bracketed key value pair
+
+ # Kinds accepted as a comparison operator, and kinds accepted as
+ # the literal operand of a comparison.
+ COMPARE_OPS = [LT, GT, LE, GE, EQ]
+ COMPARATORS = [NUMBER, STRING]
+
+ # Create a token of the given kind positioned at the tokenizer's
+ # current offset.  The remaining fields start out empty; the
+ # Tokenizer fills in the ones that apply to the token's kind.
+ def __init__(self, kind, tokenizer):
+ self.kind = kind
+ self.pos = tokenizer.off + tokenizer.pos
+ self.n = 0
+ self.s = ''
+ self.index = ''
+ self.attr_key = ''
+ self.attr_value = ''
+ self.groups = None
+
+ # Short description used in error messages.
+ def __str__(self):
+ return '<Token ' + self.kind + ' at char ' + str(self.pos) + '>'
+
+class Tokenizer:
+ def __init__(self, s):
+ # Set up to tokenize the expression string 's' from its start.
+ self.off = 0
+ self.s = s + '?' # add a char that won't match anything
+ self.pos = 0
+ # 'end' is the length of the original string, so the extra
+ # character appended above sits just past the scanned range.
+ self.end = len(s)
+ # Patterns for numbers, single- and double-quoted strings, and
+ # [key=value] attributes.
+ self.re_num = re.compile(r"(\d+)")
+ self.re_quote1 = re.compile(r"'([^']*)'")
+ self.re_quote2 = re.compile(r"\"([^\"]*)\"")
+ self.re_attr = re.compile(r"\[(\w+)=(\w+)\]")
+ # Holds at most one token returned via pushback().
+ self.pushed = None
+
+ def newToken(self, kind, sz):
+ # Create a token of 'kind' at the current position, then consume
+ # 'sz' characters.  The token must be built before advancing,
+ # since its constructor records the tokenizer's position.
+ t = Token(kind, self)
+ self.pos += sz
+ return t
+
+ def error(self, s):
+ # Raise a ParseException whose message is prefixed with the
+ # current scan position.
+ raise ParseException(str(self.pos) + ': ' + s)
+
+ def matched(self, kind, repat):
+     # Match the pattern 'repat' at the current position and return
+     # a token of 'kind' covering the matched text.  On failure an
+     # error is raised that quotes up to 10 characters of input.
+     start = self.pos
+     m = re.match(repat, self.s[start:])
+     if not m:
+         stop = start + 10
+         if stop > self.end:
+             stop = self.end
+         self.error('matching ' + kind + ' at "' +
+             self.s[start:stop] + '..."')
+     length = m.end()
+     t = self.newToken(kind, length)
+     t.groups = m.groups()
+     t.s = self.s[start:start + length]
+     return t
+
+ def available(self):
+ # Return True if another token can be read.  When nothing is
+ # pushed back, the next token is read and pushed back so that
+ # it is not lost.
+ if self.pushed == None:
+ self.pushback(self.token())
+ return (self.pushed != None)
+
+ def pushback(self, token):
+ # Return 'token' to the stream so the next token() call yields
+ # it again.  Only one token may be pending at a time.
+ if self.pushed != None:
+ raise AssertionError('pushback more than once')
+ self.pushed = token
+
+ def peek(self):
+ # Return the next token without consuming it: it is read and
+ # immediately pushed back.
+ token = self.token()
+ self.pushback(token)
+ return token
+
+ def scan(self):
+     # Advance past whitespace.  Return the character now at the
+     # current position, or '' once the end of input is reached.
+     while self.pos < self.end:
+         ch = self.s[self.pos]
+         if not ch.isspace():
+             return ch
+         self.pos += 1
+     return ''
+
+ def token(self):
+     # Return the next token, or None at end of input.  A pending
+     # pushed-back token is returned first.  A character that starts
+     # no known token raises ParseException (through error()) rather
+     # than being reported as end of input, which would silently
+     # drop the remainder of the expression.
+     if self.pushed != None:
+         ret = self.pushed
+         self.pushed = None
+         return ret
+     c = self.scan()
+     if self.pos >= self.end:
+         return None
+     lookahead = '' if self.pos + 1 >= self.end else self.s[self.pos+1]
+     #self.tty("Tokenizer.token char=" + c + ", lookahead=" + lookahead)
+     if c == "'":
+         t = self.matched(Token.STRING, self.re_quote1)
+         t.s = t.groups[0]
+         return t
+     if c == '"':
+         t = self.matched(Token.STRING, self.re_quote2)
+         t.s = t.groups[0]
+         return t
+     if c in "{}(),":
+         # For these, the token kind is the character itself.
+         return self.newToken(c, 1)
+     if c == "|":
+         if lookahead != "|":
+             self.error('matching OR')
+         return self.newToken(Token.OR, 2)
+     if c == "&":
+         if lookahead != "&":
+             self.error('matching AND')
+         return self.newToken(Token.AND, 2)
+     if c in "0123456789":
+         t = self.matched(Token.NUMBER, self.re_num)
+         t.s = t.groups[0]
+         t.n = int(t.s)
+         return t
+     if c in "ABCDEFGHIJ":
+         # Column names are single upper case letters.
+         t = self.newToken(Token.COLUMN, 1)
+         t.s = c
+         return t
+     if c == '<':
+         if lookahead == '=':
+             return self.newToken(Token.LE, 2)
+         else:
+             return self.newToken(Token.LT, 1)
+     if c == '>':
+         if lookahead == '=':
+             return self.newToken(Token.GE, 2)
+         else:
+             return self.newToken(Token.GT, 1)
+     if c == "=":
+         if lookahead != "=":
+             self.error('matching EQ')
+         return self.newToken(Token.EQ, 2)
+     if c == "[":
+         t = self.matched(Token.ATTRIBUTE, self.re_attr)
+         t.attr_key = t.groups[0]
+         t.attr_value = t.groups[1]
+         return t
+     self.error('unexpected character "' + c + '"')
+
+ def tty(self, s):
+ # Debugging aid: forward the message to the test case's tty().
+ wttest.WiredTigerTestCase.tty(s)
+
+# test_join07.py
+# Join interpreter
+class test_join07(wttest.WiredTigerTestCase):
+ reverseop = { '==' : '==', '<=' : '>=', '<' : '>', '>=' : '<=', '>' : '<' }
+ compareop = { '==' : 'eq', '<=' : 'le', '<' : 'lt', '>=' : 'ge',
+ '>' : 'gt' }
+ columnmult = { 'A' : 1, 'B' : 2, 'C' : 3, 'D' : 4, 'E' : 5,
+ 'F' : 6, 'G' : 7, 'H' : 8, 'I' : 9, 'J' : 10 }
+
+ extractscen = [
+ ('extractor', dict(extractor=True)),
+ ('noextractor', dict(extractor=False))
+ ]
+
+ scenarios = number_scenarios(extractscen)
+
+ # Return the wiredtiger_open extension argument for a shared library.
+ def extensionArg(self, exts):
+     # 'exts' is a list of (dirname, name, libname) tuples.  Entries
+     # whose name is None or 'none' are ignored.  For the others the
+     # shared library path under the build directory is computed;
+     # the test is skipped if that library does not exist.  Returns
+     # '' when no library is needed, otherwise an ',extensions=[...]'
+     # string listing each distinct library once.
+     extfiles = []
+     for ext in exts:
+         (dirname, name, libname) = ext
+         if name is not None and name != 'none':
+             extdir = os.path.join(run.wt_builddir, 'ext', dirname)
+             extfile = os.path.join(
+                 extdir, name, '.libs', 'libwiredtiger_' + libname + '.so')
+             if not os.path.exists(extfile):
+                 self.skipTest('extension "' + extfile + '" not built')
+             if extfile not in extfiles:
+                 extfiles.append(extfile)
+     if len(extfiles) == 0:
+         return ''
+     else:
+         return ',extensions=["' + '","'.join(extfiles) + '"]'
+
+ # Override WiredTigerTestCase, we have extensions.
+ def setUpConnectionOpen(self, dir):
+     # Open the connection with the csv extractor extension loaded,
+     # and an error prefix built from the test's short id.
+     extarg = self.extensionArg([('extractors', 'csv', 'csv_extractor')])
+     connarg = 'create,error_prefix="{0}: ",{1}'.format(
+         self.shortid(), extarg)
+     conn = self.wiredtiger_open(dir, connarg)
+     # repr() is the portable spelling of the backtick operator,
+     # which is a syntax error in Python 3.
+     self.pr(repr(conn))
+     return conn
+
+ def expect(self, token, expected):
+ # Check that 'token' exists and that its kind is one of the
+ # kinds listed in 'expected'; report an error via err() if not.
+ # Returns the token so calls can be used inline.
+ if token == None or token.kind not in expected:
+ self.err(token, 'expected one of: ' + str(expected))
+ return token
+
+ def err(self, token, msg):
+ # Fail the test with a message that identifies the token.
+ self.assertTrue(False, 'ERROR at token ' + str(token) + ': ' + msg)
+
+ def gen_key(self, i):
+     # Return the key for row i as a one-element list: the number
+     # itself, or for string keys ('S') a zero padded 'keyNNNNNN'.
+     if self.keyformat != 'S':
+         return [ i ]
+     padded = 'key%06d' % i    # zero pad so it sorts expectedly
+     return [ padded ]
+
+ def gen_values(self, i):
+     # Return the 11 values for row i.  Ten numbers are computed as
+     # (i * x) % N for x in 1..10.  The first value returned is
+     # those ten numbers comma separated in one string, followed by
+     # the first five as integers and the last five as strings.
+     nums = [(i * x) % self.N for x in range(1, 11)]
+     strs = [str(v) for v in nums]
+     return [",".join(strs)] + nums[:5] + strs[5:]
+
+ def iterate(self, jc, mbr):
+     # Walk the join cursor 'jc' and check that it returns exactly
+     # the row numbers in 'mbr', each with the values gen_values()
+     # generates for it.
+     mbr = set(mbr) # we need a mutable set
+     # Row numbers already seen.  This is a set so the membership
+     # test done for every returned row is not a linear scan.
+     gotkeys = set()
+     #self.tty('iteration expects ' + str(len(mbr)) +
+     #    ' entries: ' + str(mbr))
+     while jc.next() == 0:
+         [k] = jc.get_keys()
+         values = jc.get_values()
+         if self.keyformat == 'S':
+             # String keys look like 'keyNNNNNN', see gen_key().
+             i = int(str(k[3:]))
+         else:
+             i = k
+         #self.tty('GOT key=' + str(k) + ', values=' + str(values))
+
+         # Duplicates may be returned when the disjunctions are used,
+         # so we ignore them.
+         if i not in gotkeys:
+             self.assertEquals(self.gen_values(i), values)
+             if i not in mbr:
+                 self.tty('ERROR: result ' + str(i) + ' is not in: ' +
+                     str(mbr))
+             self.assertTrue(i in mbr)
+             mbr.remove(i)
+             gotkeys.add(i)
+     # Every expected row must have been returned.
+     self.assertEquals(0, len(mbr))
+
+ def token_literal(self, token):
+     # Return the Python value of a literal token: the text of a
+     # string token or the integer of a number token.  Any other
+     # kind of token yields None.
+     kind = token.kind
+     if kind == Token.NUMBER:
+         return token.n
+     if kind == Token.STRING:
+         return token.s
+     return None
+
+ def idx_sim(self, x, mult, isstr):
+     # Compute (x * mult) % N, the same arithmetic gen_values() uses
+     # for a column with multiplier 'mult'.  With 'isstr' set, x is
+     # converted with int() first and the result is returned as a
+     # string.
+     if not isstr:
+         return (x * mult % self.N)
+     product = int(x) * mult
+     return str(product % self.N)
+
+ def mkmbr(self, expr):
+     # Return the frozenset of all row numbers in self.allN for
+     # which the predicate 'expr' is true.
+     selected = filter(expr, self.allN)
+     return frozenset(selected)
+
+ def join_one_side(self, jc, coltok, littok, optok, conjunction,
+ isright, mbr):
+ # Join one comparison (column versus literal) into the join
+ # cursor 'jc', and apply the same comparison to the expected
+ # membership set 'mbr'.  Returns the updated set.
+ #
+ # The index cursor for the column is opened and positioned on
+ # the literal; it is kept in jc.cursors so it can be closed
+ # later.
+ idxname = 'index:join07:' + coltok.s
+ cursor = self.session.open_cursor(idxname, None, None)
+ jc.cursors.append(cursor)
+ literal = self.token_literal(littok)
+ cursor.set_key(literal)
+ searchret = cursor.search()
+ if searchret != 0:
+ self.tty('ERROR: cannot find value ' + str(literal) +
+ ' in ' + idxname)
+ self.assertEquals(0, searchret)
+ op = optok.kind
+ # When the literal is on the left of the column ("7 < A"), the
+ # operator is reversed so it reads as column-op-literal.
+ if not isright:
+ op = self.reverseop[op]
+ mult = self.columnmult[coltok.s]
+ config = 'compare=' + self.compareop[op] + ',operation=' + \
+ ('and' if conjunction else 'or')
+ # A 'bloom' attribute is set on the column token by
+ # parse_column_attributes().
+ if hasattr(coltok, 'bloom'):
+ config += ',strategy=bloom,count=' + str(coltok.bloom)
+ #self.tty('join(jc, cursor=' + str(literal) + ', ' + config)
+ self.session.join(jc, cursor, config)
+ # Build the set of rows satisfying this comparison by
+ # recomputing each row's column value with idx_sim().
+ isstr = type(literal) is str
+ if op == '==':
+ tmbr = self.mkmbr(lambda x: self.idx_sim(x, mult, isstr) == literal)
+ elif op == '<=':
+ tmbr = self.mkmbr(lambda x: self.idx_sim(x, mult, isstr) <= literal)
+ elif op == '<':
+ tmbr = self.mkmbr(lambda x: self.idx_sim(x, mult, isstr) < literal)
+ elif op == '>=':
+ tmbr = self.mkmbr(lambda x: self.idx_sim(x, mult, isstr) >= literal)
+ elif op == '>':
+ tmbr = self.mkmbr(lambda x: self.idx_sim(x, mult, isstr) > literal)
+ # Combine with the running set: intersect for &&, union for ||.
+ if conjunction:
+ mbr = mbr.intersection(tmbr)
+ else:
+ mbr = mbr.union(tmbr)
+ return mbr
+
+ def parse_join(self, jc, tokenizer, conjunction, mbr):
+     # Parse one term of a junction and join it into 'jc'.  A term is
+     # either a parenthesized sub-expression, handled with a nested
+     # join cursor, or a comparison on one column of the form
+     #     [literal op] COLUMN [attributes] [op literal]
+     # 'mbr' is the expected membership set so far; the updated set
+     # is returned.
+     left = None
+     right = None
+     leftop = None
+     rightop = None
+     col = None
+     token = tokenizer.token()
+     if token.kind == Token.LPAREN:
+         subjc = self.session.open_cursor('join:table:join07', None, None)
+         jc.cursors.append(subjc)
+         submbr = self.parse_junction(subjc, tokenizer)
+         config = 'operation=' + ('and' if conjunction else 'or')
+         self.session.join(jc, subjc, config)
+         if conjunction:
+             mbr = mbr.intersection(submbr)
+         else:
+             mbr = mbr.union(submbr)
+         return mbr
+     if token.kind in Token.COMPARATORS:
+         left = token
+         leftop = self.expect(tokenizer.token(), Token.COMPARE_OPS)
+         token = tokenizer.token()
+     col = self.expect(token, [Token.COLUMN])
+     token = tokenizer.token()
+     # The expression may end right after the column, in which case
+     # token is None.  Token.ATTRIBUTE is a string, so the kind is
+     # compared with '==': 'in' would be a substring test.
+     if token is not None and token.kind == Token.ATTRIBUTE:
+         tokenizer.pushback(token)
+         self.parse_column_attributes(tokenizer, col)
+         token = tokenizer.token()
+     if token is not None and token.kind in Token.COMPARE_OPS:
+         rightop = token
+         right = self.expect(tokenizer.token(), Token.COMPARATORS)
+         token = tokenizer.token()
+     # The last token read belongs to the caller.
+     tokenizer.pushback(token)
+
+     # Now we have everything we need to do a join.
+     if left != None:
+         mbr = self.join_one_side(jc, col, left, leftop, conjunction,
+             False, mbr)
+     if right != None:
+         mbr = self.join_one_side(jc, col, right, rightop, conjunction,
+             True, mbr)
+     return mbr
+
+ # Parse a set of joins, grouped by && or ||
+ def parse_junction(self, jc, tokenizer):
+ # Parse terms joined by a single kind of operator (all && or all
+ # ||) up to the end of input or a closing parenthesis, joining
+ # each term into 'jc'.  Returns the expected membership set.
+ #
+ # Cursors opened for this join are collected on the join cursor.
+ jc.cursors = []
+
+ # Take a peek at the tokenizer's stream to see if we
+ # have a conjunction or disjunction
+ token = tokenizer.peek()
+ s = tokenizer.s[token.pos:]
+ (andpos, orpos) = self.find_nonparen(s, ['&', '|'])
+ # Whichever operator appears first outside parentheses decides.
+ # A disjunction starts from the empty set, a conjunction from
+ # the set of all rows.
+ if orpos >= 0 and (andpos < 0 or orpos < andpos):
+ conjunction = False
+ mbr = frozenset()
+ else:
+ conjunction = True
+ mbr = frozenset(self.allN)
+
+ while tokenizer.available():
+ mbr = self.parse_join(jc, tokenizer, conjunction, mbr)
+ token = tokenizer.token()
+ if token != None:
+ # Mixing && and || at the same level is not supported.
+ if token.kind == Token.OR:
+ self.assertTrue(not conjunction)
+ elif token.kind == Token.AND:
+ self.assertTrue(conjunction)
+ elif token.kind == Token.RPAREN:
+ break
+ else:
+ self.err(token, 'unexpected token')
+ return mbr
+
+ def parse_attributes(self, tokenizer):
+ # Consume consecutive attribute tokens and return them as a
+ # list.  The first token that is not an attribute (or None at
+ # end of input) is pushed back.
+ attributes = []
+ token = tokenizer.token()
+ while token != None and token.kind == Token.ATTRIBUTE:
+ attributes.append(token)
+ token = tokenizer.token()
+ tokenizer.pushback(token)
+ return attributes
+
+ # Find a set of chars that aren't within parentheses.
+ # For this simple language, we don't allow parentheses in quoted literals.
+ def find_nonparen(self, s, matchlist):
+     # For each character in 'matchlist', find the position of its
+     # first occurrence in 's' that is not inside parentheses.
+     # Returns a list parallel to 'matchlist', holding -1 for
+     # characters not found.  The scan stops once every character
+     # has been found, or at a ')' that closes a parenthesis opened
+     # before the start of 's'.
+     found = [-1] * len(matchlist)
+     remaining = len(matchlist)
+     depth = 0
+     for idx, ch in enumerate(s):
+         if remaining == 0:
+             break
+         if ch == '(':
+             depth += 1
+         elif ch == ')':
+             depth -= 1
+             if depth < 0:
+                 break
+         elif depth == 0 and ch in matchlist:
+             which = matchlist.index(ch)
+             if found[which] < 0:
+                 found[which] = idx
+                 remaining -= 1
+     return found
+
+ def parse_toplevel(self, jc, tokenizer):
+ # The top level of an expression is parsed as a junction.
+ return self.parse_junction(jc, tokenizer)
+
+ def parse_toplevel_attributes(self, tokenizer):
+ # Apply the attributes that lead the expression: [N=...] sets
+ # self.N and [key=...] sets self.keyformat.  Any other key is a
+ # parse error.
+ for attrtoken in self.parse_attributes(tokenizer):
+ key = attrtoken.attr_key
+ value = attrtoken.attr_value
+ #self.tty('ATTR:' + str([key,value]))
+ if key == 'N':
+ self.N = int(value)
+ elif key == 'key':
+ self.keyformat = value
+ else:
+ tokenizer.error('bad attribute key: ' + str(key))
+
+ def parse_column_attributes(self, tokenizer, c):
+ # Apply the attributes that follow a column: [bloom=...] is
+ # stored as an integer on the column token 'c'.  Any other key
+ # is a parse error.
+ for attrtoken in self.parse_attributes(tokenizer):
+ key = attrtoken.attr_key
+ value = attrtoken.attr_value
+ #self.tty('ATTR:' + str([key,value]))
+ if key == 'bloom':
+ c.bloom = int(value)
+ else:
+ tokenizer.error('bad column attribute key: ' + str(key))
+
+ def close_cursors(self, jc):
+ # Close the join cursor, then everything recorded in its
+ # 'cursors' list.  Nested join cursors (URI starting with
+ # 'join:') are closed recursively.
+ jc.close()
+ for c in jc.cursors:
+ if c.uri[0:5] == 'join:':
+ self.close_cursors(c)
+ else:
+ c.close()
+
+ def interpret(self, s):
+ # Run one join expression end to end: create and populate the
+ # table and its ten indices, build the join described by 's',
+ # check the rows it returns against the expected set, then
+ # clean up.
+ #self.tty('INTERPRET: ' + s)
+ # Defaults, may be overridden by leading attributes in 's'.
+ self.N = 1000
+ self.keyformat = "r"
+ self.keycols = 'k'
+
+ # Grab attributes before creating anything, as some attributes
+ # may override needed parameters.
+ tokenizer = Tokenizer(s)
+ self.parse_toplevel_attributes(tokenizer)
+ self.allN = range(1, self.N + 1)
+
+ self.session.create('table:join07', 'key_format=' + self.keyformat +
+ ',value_format=SiiiiiSSSSS,' +
+ 'columns=(' + self.keycols +
+ ',S,A,B,C,D,E,F,G,H,I,J)')
+ mdfieldnum = 0
+ mdformat = 'i'
+ mdconfig = ''
+ # One index per column.  In the extractor scenario the index
+ # uses the csv extractor, told through app_metadata which field
+ # to extract and in what format; columns from F onward are
+ # strings.  Otherwise a plain column index is created.
+ for colname in [ 'A','B','C','D','E','F','G','H','I','J' ]:
+ if self.extractor:
+ if colname == 'F':
+ mdformat = 'S'
+ mdconfig = 'app_metadata={"format" : "%s","field" : "%d"}' % \
+ (mdformat, mdfieldnum)
+ config = 'extractor=csv,key_format=%s' % mdformat
+ mdfieldnum += 1
+ else:
+ config = 'columns=(%s)' % colname
+ self.session.create('index:join07:%s' % colname,
+ '%s,%s' % (config, mdconfig))
+ # Populate rows 1..N.
+ c = self.session.open_cursor('table:join07', None, None)
+ for i in self.allN:
+ c.set_key(*self.gen_key(i))
+ c.set_value(*self.gen_values(i))
+ c.insert()
+ c.close()
+
+ # Build the join from the rest of the expression and verify it.
+ jc = self.session.open_cursor('join:table:join07', None, None)
+ mbr = self.parse_toplevel(jc, tokenizer)
+ self.iterate(jc, mbr)
+
+ self.close_cursors(jc)
+ self.session.drop('table:join07')
+
+ def test_join_string(self):
+ # Each call builds a fresh table, runs the join described by
+ # the expression and checks its results, see interpret().
+ # Leading [N=...] and [key=...] attributes override the row
+ # count and key format; where omitted the defaults apply.
+ self.interpret("[N=1000][key=r] 7 < A <= 500 && B < 150 && C > 17")
+ self.interpret("[N=1001][key=r] 7 < A <= 500 && B < 150 && F > '234'")
+ self.interpret("[N=10000][key=r] 7 < A <= 500 && B < 150 && " +
+ "(F > '234' || G < '100')")
+ self.interpret("[N=7919][key=r](7 < A <= 9)&&(F > '234')")
+ self.interpret("[N=1000][key=S](A>=0 && A<0)||(A>999)")
+ self.interpret("[N=2000][key=S](A>=0 && A<0)||(A>1999)")
+ self.interpret("(7<A<=10 && B < 150)||(B>998)")
+ self.interpret("(7<A<=10 && B < 150)||(J=='990')")
+ clause1 = "(7 < A <= 500 && B < 150)"
+ clause2 = "(F > '234' || G < '100')"
+ self.interpret("[N=1000][key=r]" + clause1 + "&&" + clause2)
+ self.interpret("(7<A<=10)||(B>994||C<12)")
+ self.interpret("(7<A<=10 && B < 150)||(B>996||C<6)")
+ self.interpret("[N=1000][key=r]" + clause2 + "||" + clause1)
+ self.interpret("[N=1000][key=r]" + clause1 + "||" + clause2)
+ self.interpret("[N=1000][key=S]" + clause2 + "&&" + clause1)
+ # The same clauses again, with Bloom filters requested on some
+ # of the columns.
+ clause1 = "(7 < A <= 500 && B[bloom=300] < 150)"
+ clause2 = "(F[bloom=500] > '234' || G[bloom=20] < '100')"
+ self.interpret("[N=1000][key=S]" + clause1 + "&&" + clause2)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_join08.py b/test/suite/test_join08.py
new file mode 100644
index 00000000000..6d674ab8193
--- /dev/null
+++ b/test/suite/test_join08.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2016 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import wiredtiger, wttest
+from wtscenario import check_scenarios, multiply_scenarios, number_scenarios
+
+# test_join08.py
+# Test join error paths
+class test_join08(wttest.WiredTigerTestCase):
+ nentries = 100
+
+ # We need statistics for these tests.
+ conn_config = 'statistics=(all)'
+
+ def gen_key(self, i):
+ return [ i + 1 ]
+
+ def gen_values(self, i):
+ s = str(i)
+ rs = s[::-1]
+ sort3 = (self.nentries * (i % 3)) + i # multiples of 3 sort first
+ return [s, rs, sort3]
+
+ def test_join_errors(self):
+ self.session.create('table:join08', 'key_format=r,value_format=SS'
+ ',columns=(k,v0,v1)')
+ self.session.create('table:join08B', 'key_format=r,value_format=SS'
+ ',columns=(k,v0,v1)')
+ self.session.create('index:join08:index0','columns=(v0)')
+ self.session.create('index:join08:index1','columns=(v1)')
+ self.session.create('index:join08B:index0','columns=(v0)')
+ jc = self.session.open_cursor('join:table:join08', None, None)
+ tc = self.session.open_cursor('table:join08', None, None)
+ fc = self.session.open_cursor('file:join08.wt', None, None)
+ ic0 = self.session.open_cursor('index:join08:index0', None, None)
+ ic0again = self.session.open_cursor('index:join08:index0', None, None)
+ ic1 = self.session.open_cursor('index:join08:index1', None, None)
+ icB = self.session.open_cursor('index:join08B:index0', None, None)
+ tcB = self.session.open_cursor('table:join08B', None, None)
+
+ tc.set_key(1)
+ tc.set_value('val1', 'val1')
+ tc.insert()
+ tcB.set_key(1)
+ tcB.set_value('val1', 'val1')
+ tcB.insert()
+ fc.next()
+
+ # Joining using a non join-cursor
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(tc, ic0, 'compare=ge'),
+ '/not a join cursor/')
+ # Joining a file cursor, which is not an index, table or join cursor
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, fc, 'compare=ge'),
+ '/must be an index, table or join cursor/')
+ # Joining a non positioned cursor
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0, 'compare=ge'),
+ '/requires reference cursor be positioned/')
+ ic0.set_key('val1')
+ # Joining a non positioned cursor (no search or next has been done)
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0, 'compare=ge'),
+ '/requires reference cursor be positioned/')
+ ic0.set_key('valXX')
+ self.assertEqual(ic0.search(), wiredtiger.WT_NOTFOUND)
+ # Joining a non positioned cursor after failed search
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0, 'compare=ge'),
+ '/requires reference cursor be positioned/')
+
+ # position the cursors now
+ ic0.set_key('val1')
+ ic0.search()
+ ic0again.next()
+ icB.next()
+
+ # Joining non matching index
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, icB, 'compare=ge'),
+ '/table for join cursor does not match/')
+
+ # The cursor must be positioned
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic1, 'compare=ge'),
+ '/requires reference cursor be positioned/')
+ ic1.next()
+
+ # This succeeds.
+ self.session.join(jc, ic1, 'compare=ge'),
+
+ # With bloom filters, a count is required
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0, 'compare=ge,strategy=bloom'),
+ '/count must be nonzero/')
+
+ # This succeeds.
+ self.session.join(jc, ic0, 'compare=ge,strategy=bloom,count=1000'),
+
+ bloom_config = ',strategy=bloom,count=1000'
+ # Cannot use the same index cursor
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0,
+ 'compare=le' + bloom_config),
+ '/cursor already used in a join/')
+
+ # When joining with the same index, need compatible compares
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0again, 'compare=ge' + bloom_config),
+ '/join has overlapping ranges/')
+
+ # Another incompatible compare
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0again, 'compare=gt' + bloom_config),
+ '/join has overlapping ranges/')
+
+ # Compare is compatible, but bloom args need to match
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0again, 'compare=le'),
+ '/join has incompatible strategy/')
+
+ # Counts need to match for bloom filters
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: self.session.join(jc, ic0again, 'compare=le,strategy=bloom,'
+ 'count=100'), '/count.* does not match previous count/')
+
+ # This succeeds
+ self.session.join(jc, ic0again, 'compare=le,strategy=bloom,count=1000')
+
+ # Need to do initial next() before getting key/values
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: jc.get_keys(),
+ '/join cursor must be advanced with next/')
+
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: jc.get_values(),
+ '/join cursor must be advanced with next/')
+
+ # Operations on the joined cursor are frozen until the join is closed.
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: ic0.next(),
+ '/cursor is being used in a join/')
+
+ # Operations on the joined cursor are frozen until the join is closed.
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: ic0.prev(),
+ '/cursor is being used in a join/')
+
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: ic0.reset(),
+ '/cursor is being used in a join/')
+
+ # Only a small number of operations allowed on a join cursor
+ msg = "/Unsupported cursor/"
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: jc.search(), msg)
+
+ self.assertRaisesWithMessage(wiredtiger.WiredTigerError,
+ lambda: jc.prev(), msg)
+
+ self.assertEquals(jc.next(), 0)
+ self.assertEquals(jc.next(), wiredtiger.WT_NOTFOUND)
+
+ # Only after the join cursor is closed can we use the index cursor
+ # normally
+ jc.close()
+ self.assertEquals(ic0.next(), wiredtiger.WT_NOTFOUND)
+ self.assertEquals(ic0.prev(), 0)
+
+ # Common code for making sure that cursors are implicitly closed when
+ # the session is closed, no matter the order in which they were opened.
+ def cursor_close_common(self, joinfirst):
+ self.session.create('table:join08', 'key_format=r' +
+ ',value_format=SS,columns=(k,v0,v1)')
+ self.session.create('index:join08:index0','columns=(v0)')
+ self.session.create('index:join08:index1','columns=(v1)')
+ c = self.session.open_cursor('table:join08', None, None)
+ for i in range(0, self.nentries):
+ c.set_key(*self.gen_key(i))
+ c.set_value(*self.gen_values(i))
+ c.insert()
+ c.close()
+
+ if joinfirst:
+ jc = self.session.open_cursor('join:table:join08', None, None)
+ c0 = self.session.open_cursor('index:join08:index0', None, None)
+ c1 = self.session.open_cursor('index:join08:index1', None, None)
+ c0.next() # index cursors must be positioned
+ c1.next()
+ if not joinfirst:
+ jc = self.session.open_cursor('join:table:join08', None, None)
+ self.session.join(jc, c0, 'compare=ge')
+ self.session.join(jc, c1, 'compare=ge')
+ self.session.close()
+ self.session = None
+
+ def test_cursor_close1(self):
+ self.cursor_close_common(True)
+
+ def test_cursor_close2(self):
+ self.cursor_close_common(False)
+
+ # test statistics with a simple one index join cursor
+ def test_simple_stats(self):
+ self.session.create("table:join01b",
+ "key_format=i,value_format=i,columns=(k,v)")
+ self.session.create("index:join01b:index", "columns=(v)")
+
+ cursor = self.session.open_cursor("table:join01b", None, None)
+ cursor[1] = 11
+ cursor[2] = 12
+ cursor[3] = 13
+ cursor.close()
+
+ cursor = self.session.open_cursor("index:join01b:index", None, None)
+ cursor.set_key(11)
+ cursor.search()
+
+ jcursor = self.session.open_cursor("join:table:join01b", None, None)
+ self.session.join(jcursor, cursor, "compare=gt")
+
+ while jcursor.next() == 0:
+ [k] = jcursor.get_keys()
+ [v] = jcursor.get_values()
+
+ statcur = self.session.open_cursor("statistics:join", jcursor, None)
+ found = False
+ while statcur.next() == 0:
+ [desc, pvalue, value] = statcur.get_values()
+ #self.tty(str(desc) + "=" + str(pvalue))
+ found = True
+ self.assertEquals(found, True)
+
+ jcursor.close()
+ cursor.close()
+
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/test/suite/test_reconfig02.py b/test/suite/test_reconfig02.py
index aee8ee4458b..85a9ceb2a34 100644
--- a/test/suite/test_reconfig02.py
+++ b/test/suite/test_reconfig02.py
@@ -74,9 +74,15 @@ class test_reconfig02(wttest.WiredTigerTestCase):
# Now turn on pre-allocation. Sleep to give the worker thread
# a chance to run and verify pre-allocated log files exist.
+ #
+ # Potentially loop a few times in case it is a very slow system.
self.conn.reconfigure("log=(prealloc=true)")
- time.sleep(2)
- prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*")
+ for x in xrange(0, 20):
+ time.sleep(1)
+ prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*")
+ if len(prep_logs) != 0:
+ break
+
self.assertNotEqual(0, len(prep_logs))
# Logging starts on, but archive is off. Verify it is off.
diff --git a/test/suite/test_stat05.py b/test/suite/test_stat05.py
index 6a93ec2c84d..9bcedd65089 100644
--- a/test/suite/test_stat05.py
+++ b/test/suite/test_stat05.py
@@ -37,9 +37,13 @@ from helper import complex_value_populate, key_populate, value_populate
# Statistics cursor using size only
class test_stat_cursor_config(wttest.WiredTigerTestCase):
pfx = 'test_stat_cursor_size'
+ conn_config = 'statistics=(fast)'
+
uri = [
('file', dict(uri='file:' + pfx, pop=simple_populate, cfg='')),
('table', dict(uri='table:' + pfx, pop=simple_populate, cfg='')),
+ ('inmem', dict(uri='table:' + pfx, pop=simple_populate, cfg='',
+ conn_config='in_memory,statistics=(fast)')),
('table-lsm', dict(uri='table:' + pfx, pop=simple_populate,
cfg=',type=lsm,lsm=(chunk_size=1MB,merge_min=2)')),
('complex', dict(uri='table:' + pfx, pop=complex_populate, cfg='')),
@@ -49,7 +53,6 @@ class test_stat_cursor_config(wttest.WiredTigerTestCase):
]
scenarios = number_scenarios(uri)
- conn_config = 'statistics=(fast)'
def openAndWalkStatCursor(self):
c = self.session.open_cursor(
diff --git a/test/suite/test_txn04.py b/test/suite/test_txn04.py
index bbd6ce8c4e2..9d9d2db62c6 100644
--- a/test/suite/test_txn04.py
+++ b/test/suite/test_txn04.py
@@ -193,7 +193,7 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess):
self.hot_backup(self.uri, committed)
def test_ops(self):
- with self.expectedStdoutPattern('Recreating metadata'):
+ with self.expectedStdoutPattern('recreating metadata'):
self.ops()
if __name__ == '__main__':
diff --git a/test/thread/Makefile.am b/test/thread/Makefile.am
index a58f019b513..ead783185f8 100644
--- a/test/thread/Makefile.am
+++ b/test/thread/Makefile.am
@@ -9,4 +9,4 @@ t_LDFLAGS = -static
TESTS = smoke.sh
clean-local:
- rm -rf WiredTiger* wt.* *.core __stats
+ rm -rf WT_TEST __stats *.core
diff --git a/test/thread/rw.c b/test/thread/rw.c
index 913fa6e6c25..10f13b9eb04 100644
--- a/test/thread/rw.c
+++ b/test/thread/rw.c
@@ -59,16 +59,13 @@ rw_start(u_int readers, u_int writers)
total_nops = 0;
/* Create per-thread structures. */
- if ((run_info = calloc(
- (size_t)(readers + writers), sizeof(*run_info))) == NULL ||
- (tids = calloc((size_t)(readers + writers), sizeof(*tids))) == NULL)
- testutil_die(errno, "calloc");
+ run_info = dcalloc((size_t)(readers + writers), sizeof(*run_info));
+ tids = dcalloc((size_t)(readers + writers), sizeof(*tids));
/* Create the files and load the initial records. */
for (i = 0; i < writers; ++i) {
if (i == 0 || multiple_files) {
- if ((run_info[i].name = malloc(64)) == NULL)
- testutil_die(errno, "malloc");
+ run_info[i].name = dmalloc(64);
snprintf(run_info[i].name, 64, FNAME, i);
/* Vary by orders of magnitude */
@@ -88,8 +85,7 @@ rw_start(u_int readers, u_int writers)
for (i = 0; i < readers; ++i) {
offset = i + writers;
if (multiple_files) {
- if ((run_info[offset].name = malloc(64)) == NULL)
- testutil_die(errno, "malloc");
+ run_info[offset].name = dmalloc(64);
/* Have readers read from tables with writes. */
name_index = i % writers;
snprintf(
diff --git a/test/thread/thread.h b/test/thread/thread.h
index 36cdbebd210..d5f0f42ea35 100644
--- a/test/thread/thread.h
+++ b/test/thread/thread.h
@@ -26,19 +26,9 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
-#include <sys/types.h>
-#include <sys/time.h>
+#include "test_util.i"
-#include <errno.h>
-#include <inttypes.h>
-#include <pthread.h>
#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "test_util.i"
#define FNAME "file:wt.%03d" /* File name */
#define FNAME_STAT "__stats" /* File name for statistics */
diff --git a/test/utility/test_util.i b/test/utility/test_util.i
index 43982d9e4a1..833eddd87aa 100644
--- a/test/utility/test_util.i
+++ b/test/utility/test_util.i
@@ -64,9 +64,11 @@ testutil_die(int e, const char *fmt, ...)
if (custom_die != NULL)
(*custom_die)();
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- va_end(ap);
+ if (fmt != NULL) {
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ }
if (e != 0)
fprintf(stderr, ": %s", wiredtiger_strerror(e));
fprintf(stderr, "\n");
@@ -161,3 +163,58 @@ testutil_make_work_dir(char *dir)
testutil_die(ret, "%s", buf);
free(buf);
}
+
+/*
+ * dcalloc --
+ * Call calloc, dying on failure.
+ *
+ * Allocates with calloc(number, size) and returns the pointer when it is
+ * non-NULL. On a NULL return, passes errno and the requested byte count
+ * (number * size) to testutil_die for reporting.
+ *
+ * NOTE(review): there is no return statement after testutil_die, so this
+ * presumably relies on testutil_die never returning -- confirm.
+ */
+static inline void *
+dcalloc(size_t number, size_t size)
+{
+ void *p;
+
+ if ((p = calloc(number, size)) != NULL)
+ return (p);
+ testutil_die(errno, "calloc: %" WT_SIZET_FMT "B", number * size);
+}
+
+/*
+ * dmalloc --
+ * Call malloc, dying on failure.
+ *
+ * Allocates len bytes with malloc and returns the pointer when it is
+ * non-NULL. On a NULL return, passes errno and the requested length to
+ * testutil_die for reporting.
+ *
+ * NOTE(review): there is no return statement after testutil_die, so this
+ * presumably relies on testutil_die never returning -- confirm.
+ */
+static inline void *
+dmalloc(size_t len)
+{
+ void *p;
+
+ if ((p = malloc(len)) != NULL)
+ return (p);
+ testutil_die(errno, "malloc: %" WT_SIZET_FMT "B", len);
+}
+
+/*
+ * drealloc --
+ * Call realloc, dying on failure.
+ *
+ * Resizes p to len bytes with realloc and returns the new pointer when it
+ * is non-NULL. On a NULL return, passes errno and the requested length to
+ * testutil_die for reporting.
+ *
+ * NOTE(review): every NULL return is treated as a failure; realloc may
+ * also return NULL for len == 0 on some implementations -- confirm that
+ * callers never pass a zero length.
+ * NOTE(review): there is no return statement after testutil_die, so this
+ * presumably relies on testutil_die never returning -- confirm.
+ */
+static inline void *
+drealloc(void *p, size_t len)
+{
+ void *t;
+ if ((t = realloc(p, len)) != NULL)
+ return (t);
+ testutil_die(errno, "realloc: %" WT_SIZET_FMT "B", len);
+}
+
+/*
+ * dstrdup --
+ * Call strdup, dying on failure.
+ *
+ * Duplicates str with strdup and returns the copy when it is non-NULL.
+ * The parameter and return value are declared as void pointers, so the
+ * argument is handed to strdup and the result returned without casts.
+ * On a NULL return, passes errno to testutil_die with the fixed message
+ * "strdup".
+ *
+ * NOTE(review): there is no return statement after testutil_die, so this
+ * presumably relies on testutil_die never returning -- confirm.
+ */
+static inline void *
+dstrdup(const void *str)
+{
+ char *p;
+
+ if ((p = strdup(str)) != NULL)
+ return (p);
+ testutil_die(errno, "strdup");
+}